Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
bbd14169
Commit
bbd14169
authored
Jun 21, 2025
by
zhuwenwen
Browse files
skip is_rocm_aiter_moe_enabled and remove qwen unused int8 code
parent
d6a856f5
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
11 additions
and
56 deletions
+11
-56
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+8
-8
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+3
-4
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+0
-44
No files found.
vllm/model_executor/layers/fused_moe/layer.py
View file @
bbd14169
...
...
@@ -25,8 +25,8 @@ from vllm.distributed import (get_dp_group, get_ep_group,
from
vllm.forward_context
import
ForwardContext
,
get_forward_context
from
vllm.logger
import
init_logger
from
vllm.model_executor.custom_op
import
CustomOp
from
vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe
import
(
is_rocm_aiter_moe_enabled
)
#
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
#
is_rocm_aiter_moe_enabled)
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
,
QuantizeMethodBase
)
from
vllm.model_executor.layers.fused_moe.fused_moe
import
(
...
...
@@ -55,11 +55,11 @@ else:
fused_experts
=
None
# type: ignore
FusedMoEPermuteExpertsUnpermute
=
None
# type: ignore
FusedMoEPrepareAndFinalize
=
None
# type: ignore
if
is_rocm_aiter_moe_enabled
():
from
vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe
import
(
# noqa: E501
rocm_aiter_grouped_topk
as
grouped_topk
)
else
:
from
vllm.model_executor.layers.fused_moe.fused_moe
import
grouped_topk
#
if is_rocm_aiter_moe_enabled():
#
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501
#
rocm_aiter_grouped_topk as grouped_topk)
#
else:
from
vllm.model_executor.layers.fused_moe.fused_moe
import
grouped_topk
if
current_platform
.
is_tpu
():
from
.moe_pallas
import
fused_moe
as
fused_moe_pallas
else
:
...
...
@@ -443,7 +443,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
self
.
topk_indices_dtype
=
None
self
.
moe
=
moe
self
.
rocm_aiter_moe_enabled
=
is_rocm_aiter_moe_enabled
()
self
.
rocm_aiter_moe_enabled
=
False
#
is_rocm_aiter_moe_enabled()
if
self
.
rocm_aiter_moe_enabled
:
from
.rocm_aiter_fused_moe
import
rocm_aiter_fused_experts
self
.
rocm_aiter_fused_experts
=
rocm_aiter_fused_experts
...
...
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
View file @
bbd14169
...
...
@@ -35,10 +35,9 @@ class ActivationMethod(IntEnum):
@
cache
def
is_rocm_aiter_moe_enabled
()
->
bool
:
return
False
# return current_platform.is_rocm() \
# and envs.VLLM_ROCM_USE_AITER_MOE \
# and envs.VLLM_ROCM_USE_AITER
return
current_platform
.
is_rocm
()
\
and
envs
.
VLLM_ROCM_USE_AITER_MOE
\
and
envs
.
VLLM_ROCM_USE_AITER
def
rocm_aiter_asm_moe_tkw1_impl
(
...
...
vllm/model_executor/models/qwen.py
View file @
bbd14169
...
...
@@ -383,50 +383,6 @@ class QWenBaseModel(nn.Module):
weight
.
data
.
copy_
(
_weight
)
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
os
.
environ
[
'LM_NN'
]
=
'0'
lay_key_words
=
[
"attn.c_attn.weight"
,
"attn.c_proj.weight"
,
"mlp.gate_up_proj.weight"
,
"mlp.c_proj.weight"
,
]
combined_words
=
"|"
.
join
(
lay_key_words
)
weight_shapes
=
[]
all_json
=
{}
matched_key_words
=
set
()
for
layername
in
loaded_params
:
weight
=
params_dict
[
layername
]
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
and
"scale"
not
in
layername
:
weight_data
=
params_dict
[
layername
]
n
=
weight_data
.
shape
[
0
]
#rocblas和cutlass目前都需要weight做处理,但是triton不用
if
self
.
w8a8_strategy
!=
1
:
_weight
=
weight_data
.
T
.
contiguous
().
reshape
(
n
,
-
1
)
weight_data
.
data
.
copy_
(
_weight
)
#下面是针对模型记录模型出现k和n值
elif
len
(
matched_key_words
)
<
4
and
matches
[
0
]
not
in
matched_key_words
:
matched_key_words
.
add
(
matches
[
0
])
k
=
weight_data
.
shape
[
1
]
weight_shapes
.
append
({
n
,
k
})
json_file
=
self
.
tritonsingleton
.
get_w8a8json_name
(
n
,
k
)
configs_dict
=
self
.
tritonsingleton
.
get_triton_cache
(
json_file
,
n
,
k
)
if
configs_dict
:
all_json
.
update
(
configs_dict
)
if
self
.
w8a8_strategy
==
1
:
self
.
tritonsingleton
.
triton_json_dict
.
append
(
all_json
)
#找到的所有config都进行一次warmup
for
key
,
value
in
all_json
.
items
():
m
=
int
(
key
.
split
(
'_'
)[
0
])
n
=
int
(
key
.
split
(
'_'
)[
1
])
k
=
int
(
key
.
split
(
'_'
)[
2
])
ops
.
triton_int8_gemm_helper
(
m
=
m
,
n
=
n
,
k
=
k
,
per_token_act_quant
=
True
,
per_out_channel_weight_quant
=
True
,
use_bias
=
False
,
best_config
=
value
)
return
loaded_params
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment