Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
70d5953f
Unverified
Commit 70d5953f, authored Nov 26, 2025 by Huamin Li; committed by GitHub on Nov 26, 2025
Browse files
Revert "[Bugfix] Fix GPT-OSS AR+NORM fusion (#28841)" (#29483)
Signed-off-by: Huamin Li <3ericli@gmail.com>
parent
3650a74e
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing 4 changed files with 7 additions and 24 deletions
+7
-24
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+0
-1
tests/compile/distributed/test_fusions_e2e.py
tests/compile/distributed/test_fusions_e2e.py
+0
-11
vllm/distributed/device_communicators/symm_mem.py
vllm/distributed/device_communicators/symm_mem.py
+1
-1
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+6
-11
No files found.
.buildkite/test-pipeline.yaml
View file @
70d5953f
...
@@ -972,7 +972,6 @@ steps:
...
@@ -972,7 +972,6 @@ steps:
-
vllm/model_executor/layers/layernorm.py
-
vllm/model_executor/layers/layernorm.py
-
vllm/model_executor/layers/activation.py
-
vllm/model_executor/layers/activation.py
-
vllm/model_executor/layers/quantization/input_quant_fp8.py
-
vllm/model_executor/layers/quantization/input_quant_fp8.py
-
vllm/model_executor/layers/fused_moe/layer.py
-
tests/compile/test_fusion_attn.py
-
tests/compile/test_fusion_attn.py
-
tests/compile/test_silu_mul_quant_fusion.py
-
tests/compile/test_silu_mul_quant_fusion.py
-
tests/compile/distributed/test_fusion_all_reduce.py
-
tests/compile/distributed/test_fusion_all_reduce.py
...
...
tests/compile/distributed/test_fusions_e2e.py
View file @
70d5953f
...
@@ -111,17 +111,6 @@ if current_platform.is_cuda():
...
@@ -111,17 +111,6 @@ if current_platform.is_cuda():
async_tp
=
96
,
# MLP is MoE, half the fusions of dense
async_tp
=
96
,
# MLP is MoE, half the fusions of dense
),
),
),
),
ModelBackendTestCase
(
model_name
=
"openai/gpt-oss-20b"
,
model_kwargs
=
dict
(
max_model_len
=
1024
,
kv_cache_dtype
=
"fp8"
),
backend
=
AttentionBackendEnum
.
FLASHINFER
,
matches
=
Matches
(
attention_fusion
=
0
,
allreduce_fusion
=
49
,
sequence_parallel
=
49
,
async_tp
=
48
,
),
),
]
]
elif
current_platform
.
is_rocm
():
elif
current_platform
.
is_rocm
():
...
...
vllm/distributed/device_communicators/symm_mem.py
View file @
70d5953f
...
@@ -131,7 +131,7 @@ class SymmMemCommunicator:
...
@@ -131,7 +131,7 @@ class SymmMemCommunicator:
return
None
return
None
if
out
is
None
:
if
out
is
None
:
out
=
torch
.
empty_like
(
inp
)
out
=
torch
.
empty_like
(
inp
)
self
.
buffer
[:
inp
.
numel
()].
copy_
(
inp
.
reshape
(
-
1
))
self
.
buffer
[:
inp
.
numel
()].
copy_
(
inp
.
view
(
-
1
))
# Determine which algorithm to use
# Determine which algorithm to use
use_multimem
=
False
use_multimem
=
False
...
...
vllm/model_executor/layers/fused_moe/layer.py
View file @
70d5953f
...
@@ -1690,10 +1690,6 @@ class FusedMoE(CustomOp):
...
@@ -1690,10 +1690,6 @@ class FusedMoE(CustomOp):
)
)
def
reduce_output
(
states
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
reduce_output
(
states
:
torch
.
Tensor
)
->
torch
.
Tensor
:
# Slice before all_reduce to enable possible fusion
if
self
.
hidden_size
!=
og_hidden_states
:
states
=
states
[...,
:
og_hidden_states
]
if
(
if
(
not
self
.
is_sequence_parallel
not
self
.
is_sequence_parallel
and
not
self
.
use_dp_chunking
and
not
self
.
use_dp_chunking
...
@@ -1716,12 +1712,11 @@ class FusedMoE(CustomOp):
...
@@ -1716,12 +1712,11 @@ class FusedMoE(CustomOp):
if
self
.
zero_expert_num
is
not
None
and
self
.
zero_expert_num
>
0
:
if
self
.
zero_expert_num
is
not
None
and
self
.
zero_expert_num
>
0
:
assert
isinstance
(
fused_output
,
tuple
)
assert
isinstance
(
fused_output
,
tuple
)
fused_output
,
zero_expert_result
=
fused_output
fused_output
,
zero_expert_result
=
fused_output
return
(
return
(
reduce_output
(
fused_output
)
+
zero_expert_result
)[
reduce_output
(
fused_output
)
...,
:
og_hidden_states
+
zero_expert_result
[...,
:
og_hidden_states
]
]
)
else
:
else
:
return
reduce_output
(
fused_output
)
return
reduce_output
(
fused_output
)
[...,
:
og_hidden_states
]
else
:
else
:
if
current_platform
.
is_tpu
():
if
current_platform
.
is_tpu
():
# TODO: Once the OOM issue for the TPU backend is resolved, we
# TODO: Once the OOM issue for the TPU backend is resolved, we
...
@@ -1734,8 +1729,8 @@ class FusedMoE(CustomOp):
...
@@ -1734,8 +1729,8 @@ class FusedMoE(CustomOp):
hidden_states
,
router_logits
,
self
.
layer_name
hidden_states
,
router_logits
,
self
.
layer_name
)
)
return
(
return
(
reduce_output
(
shared_output
),
reduce_output
(
shared_output
)
[...,
:
og_hidden_states
]
,
reduce_output
(
fused_output
),
reduce_output
(
fused_output
)
[...,
:
og_hidden_states
]
,
)
)
def
forward_cuda
(
def
forward_cuda
(
...
...
Write
Preview
Markdown is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment