Unverified Commit c0bd6a68 authored by RoadToNowhereX's avatar RoadToNowhereX Committed by GitHub
Browse files

Fix Auto_Round Quantization Loading on SM75 and Lower GPUs (#24217)


Signed-off-by: RoadToNowhereX <37441177+RoadToNowhereX@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
parent 3144d902
...@@ -327,6 +327,8 @@ class AutoRoundConfig(QuantizationConfig): ...@@ -327,6 +327,8 @@ class AutoRoundConfig(QuantizationConfig):
if isinstance(layer, FusedMoE): if isinstance(layer, FusedMoE):
if use_marlin: if use_marlin:
return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe)
else:
from vllm.model_executor.layers.quantization.moe_wna16 import ( from vllm.model_executor.layers.quantization.moe_wna16 import (
MoeWNA16Config) MoeWNA16Config)
...@@ -339,7 +341,6 @@ class AutoRoundConfig(QuantizationConfig): ...@@ -339,7 +341,6 @@ class AutoRoundConfig(QuantizationConfig):
} }
return MoeWNA16Config.from_config(config).get_quant_method( return MoeWNA16Config.from_config(config).get_quant_method(
layer, prefix) layer, prefix)
return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe)
if isinstance(layer, (LinearBase, ParallelLMHead)): if isinstance(layer, (LinearBase, ParallelLMHead)):
if use_marlin: if use_marlin:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment