Fix Auto_Round Quantization Loading on SM75 and Lower GPUs (#24217)

Signed-off-by: RoadToNowhereX <37441177+RoadToNowhereX@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>

Fix Auto_Round Quantization Loading on SM75 and Lower GPUs (#24217)
Signed-off-by: RoadToNowhereX <37441177+RoadToNowhereX@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
c0bd6a68 · RoadToNowhereX · GitHub · 3144d902 · c0bd6a68
Unverified Commit c0bd6a68 authored Sep 10, 2025 by RoadToNowhereX Committed by GitHub Sep 10, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 1 deletion

vllm/model_executor/layers/quantization/auto_round.py vllm/model_executor/layers/quantization/auto_round.py +2 -1

No files found.
--- a/vllm/model_executor/layers/quantization/auto_round.py
+++ b/vllm/model_executor/layers/quantization/auto_round.py
@@ -327,6 +327,8 @@ class AutoRoundConfig(QuantizationConfig):

        if isinstance(layer, FusedMoE):
            if use_marlin:
+                return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe)
+            else:
                from vllm.model_executor.layers.quantization.moe_wna16 import (
                    MoeWNA16Config)

@@ -339,7 +341,6 @@ class AutoRoundConfig(QuantizationConfig):
                }
                return MoeWNA16Config.from_config(config).get_quant_method(
                    layer, prefix)
-            return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe)

        if isinstance(layer, (LinearBase, ParallelLMHead)):
            if use_marlin: