Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2e25bb12
Unverified
Commit
2e25bb12
authored
Jul 02, 2025
by
bnellnm
Committed by
GitHub
Jul 03, 2025
Browse files
[Bugfix] Fix import of CutlassExpertsFp8 in compressed_tensors_moe.py (#20381)
Signed-off-by:
Bill Nell
<
bnell@redhat.com
>
parent
9965c47d
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
11 additions
and
7 deletions
+11
-7
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
...quantization/compressed_tensors/compressed_tensors_moe.py
+5
-3
vllm/model_executor/layers/quantization/fp8.py
vllm/model_executor/layers/quantization/fp8.py
+6
-4
No files found.
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
View file @
2e25bb12
...
...
@@ -14,9 +14,9 @@ import vllm.envs as envs
from
vllm
import
_custom_ops
as
ops
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.fused_moe
import
(
CutlassExpertsFp8
,
FusedMoE
,
FusedMoEActivationFormat
,
FusedMoEConfig
,
FusedMoEMethodBase
,
FusedMoEPermuteExpertsUnpermute
,
FusedMoEPrepareAndFinalize
,
FusedMoeWeightScaleSupported
,
fused_experts
)
FusedMoE
,
FusedMoEActivationFormat
,
FusedMoEConfig
,
FusedMoEMethodBase
,
FusedMoEPermuteExpertsUnpermute
,
FusedMoEPrepareAndFinalize
,
FusedMoeWeightScaleSupported
)
from
vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16
import
(
# noqa
WNA16_SUPPORTED_BITS
,
WNA16_SUPPORTED_TYPES_MAP
)
from
vllm.model_executor.layers.quantization.utils
import
replace_parameter
...
...
@@ -570,6 +570,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
del
layer
.
w2_input_scale
self
.
fused_experts_func
=
None
else
:
from
vllm.model_executor.layers.fused_moe
import
fused_experts
self
.
fused_experts_func
=
fused_experts
def
apply
(
...
...
@@ -826,6 +827,7 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
prepare_finalize
:
FusedMoEPrepareAndFinalize
,
moe
:
FusedMoEConfig
,
)
->
FusedMoEPermuteExpertsUnpermute
:
from
vllm.model_executor.layers.fused_moe
import
CutlassExpertsFp8
use_batched_format
=
(
prepare_finalize
.
activation_format
==
FusedMoEActivationFormat
.
BatchedExperts
)
...
...
vllm/model_executor/layers/quantization/fp8.py
View file @
2e25bb12
...
...
@@ -14,10 +14,9 @@ from vllm import _custom_ops as ops
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.fused_moe
import
(
BatchedTritonOrDeepGemmExperts
,
FusedMoE
,
FusedMoEActivationFormat
,
FusedMoEConfig
,
FusedMoEMethodBase
,
FusedMoEPermuteExpertsUnpermute
,
FusedMoEPrepareAndFinalize
,
FusedMoeWeightScaleSupported
,
TritonOrDeepGemmExperts
)
FusedMoE
,
FusedMoEActivationFormat
,
FusedMoEConfig
,
FusedMoEMethodBase
,
FusedMoEPermuteExpertsUnpermute
,
FusedMoEPrepareAndFinalize
,
FusedMoeWeightScaleSupported
)
from
vllm.model_executor.layers.linear
import
(
LinearBase
,
LinearMethodBase
,
UnquantizedLinearMethod
)
from
vllm.model_executor.layers.quantization
import
QuantizationMethods
...
...
@@ -785,6 +784,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
prepare_finalize
:
FusedMoEPrepareAndFinalize
,
moe
:
FusedMoEConfig
,
)
->
FusedMoEPermuteExpertsUnpermute
:
from
vllm.model_executor.layers.fused_moe
import
(
BatchedTritonOrDeepGemmExperts
,
TritonOrDeepGemmExperts
)
assert
not
self
.
use_marlin
and
not
self
.
rocm_aiter_moe_enabled
,
(
"Marlin and ROCm AITER are not supported with all2all yet."
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment