Unverified commit b9986454, authored by Srikanth Srinivas, committed by GitHub
Browse files

Fix attention layers to remain unquantized during moe_wna16 quantization (#12570)



Fix AWQ-quantized loading of the new R1 model

The new optimized MoE kernels for a large number of experts (`moe_wna16`)
use AWQ quantization, which requires the attention layers to be in 16-bit.

The current merge broke this; `get_quant_method` must
return None for it to work correctly again.

---------
Signed-off-by: Srikanth Srinivas <srikanth@astrum.ai>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Beim <beim2015@outlook.com>
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: npanpaliya <nishidha.panpaliya@partner.ibm.com>
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: Ryan N <ryan.nguyen@centml.ai>
Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: Rahul Tuli <rahul@neuralmagic.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: Vicente Herrera <vicenteherrera@vicenteherrera.com>
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Shawn Du <shawnd200@outlook.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Beim <805908499@qq.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Nishidha <nishidha.panpaliya@partner.ibm.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: simon-mo <simon.mo@hey.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Kevin H. Luu <kevin@anyscale.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Ryan Nguyen <96593302+xpbowler@users.noreply.github.com>
Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com>
Co-authored-by: fade_away <1028552010@qq.com>
Co-authored-by: weilong.yu <weilong.yu@shopee.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Eldar Kurtic <eldarkurtic314@gmail.com>
Co-authored-by: Rahul Tuli <rahul@neuralmagic.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Vicente Herrera <vicenteherrera@vicenteherrera.com>
Co-authored-by: Jinzhen Lin <linjinzhen@hotmail.com>
Co-authored-by: Shawn Du <shawnd200@outlook.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
parent c5932e5d
......@@ -7,7 +7,8 @@ import torch
from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.layers.linear import (LinearBase,
UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
from vllm.model_executor.layers.quantization.base_config import (
......@@ -125,9 +126,7 @@ class MoeWNA16Config(QuantizationConfig):
prefix: str) -> Optional["QuantizeMethodBase"]:
if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
return UnquantizedLinearMethod()
elif isinstance(layer, FusedMoE):
return MoeWNA16Method(self)
else:
elif isinstance(layer, LinearBase):
if self.linear_quant_method == "gptq":
if self.use_marlin:
return GPTQMarlinConfig.from_config(
......@@ -144,6 +143,9 @@ class MoeWNA16Config(QuantizationConfig):
self.full_config).get_quant_method(layer, prefix)
else:
raise ValueError("moe_wna16 only support gptq and awq.")
elif isinstance(layer, FusedMoE):
return MoeWNA16Method(self)
return None
def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment