Unverified commit c0bd6a68, authored by RoadToNowhereX and committed by GitHub
Browse files

Fix Auto_Round Quantization Loading on SM75 and Lower GPUs (#24217)


Signed-off-by: RoadToNowhereX <37441177+RoadToNowhereX@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
parent 3144d902
......@@ -327,6 +327,8 @@ class AutoRoundConfig(QuantizationConfig):
if isinstance(layer, FusedMoE):
if use_marlin:
return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe)
else:
from vllm.model_executor.layers.quantization.moe_wna16 import (
MoeWNA16Config)
......@@ -339,7 +341,6 @@ class AutoRoundConfig(QuantizationConfig):
}
return MoeWNA16Config.from_config(config).get_quant_method(
layer, prefix)
return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe)
if isinstance(layer, (LinearBase, ParallelLMHead)):
if use_marlin:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment