Commit e5f51b79 authored by yangql
Browse files

新增dpsk-v3.1-awq的支持 (Add support for dpsk-v3.1-awq)

parent 5c288d91
......@@ -130,7 +130,7 @@ class AWQConfig(QuantizationConfig):
return "awq"
def get_supported_act_dtypes(self) -> list[torch.dtype]:
return [torch.half]
return [torch.half, torch.bfloat16]
@classmethod
def get_min_capability(cls) -> int:
......@@ -293,7 +293,7 @@ class AWQLinearMethod(LinearMethodBase):
pad_group=2
dim_n = layer.scales.data.shape[1]
dim_k = layer.qweight.data.shape[0]
_qw, _sz=ops.convert_s4(layer.qweight,layer.qzeros,layer.scales,int(group_size))
_qw, _sz=ops.convert_s4(layer.qweight,layer.qzeros,layer.scales.to(torch.float16),int(group_size))
sz = ops.sz_permute(_sz).reshape(-1,dim_n)
sz = sz.reshape(dim_n,-1)
_qw = _qw.reshape(dim_n,-1)
......
......@@ -10,7 +10,8 @@ import vllm.model_executor.layers.fused_moe # noqa
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported,
UnquantizedFusedMoEMethod)
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
UnquantizedLinearMethod,
set_weight_attrs)
......@@ -140,6 +141,9 @@ class AWQMarlinConfig(QuantizationConfig):
self.full_config).get_quant_method(layer, prefix)
return AWQMarlinLinearMethod(self)
elif isinstance(layer, FusedMoE):
if is_layer_skipped_awq(
prefix, getattr(self, "modules_to_not_convert", [])):
return UnquantizedFusedMoEMethod(layer.moe_config)
from vllm.model_executor.layers.quantization.moe_wna16 import (
MoeWNA16Config)
if not check_moe_marlin_supports_layer(layer, self.group_size):
......@@ -436,7 +440,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
# Why does this take the intermediate size for size_k?
marlin_w13_scales = marlin_moe_permute_scales(
s=layer.w13_scales,
s=layer.w13_scales.to(torch.float16),
size_k=layer.intermediate_size_per_partition,
size_n=layer.w13_scales.shape[2],
group_size=self.quant_config.group_size,
......@@ -445,7 +449,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
#replace_parameter(layer, "w13_scales", marlin_w13_scales)
marlin_w2_scales = marlin_moe_permute_scales(
s=layer.w2_scales,
s=layer.w2_scales.to(torch.float16),
size_k=layer.intermediate_size_per_partition,
size_n=layer.w2_scales.shape[2],
group_size=self.quant_config.group_size,
......
......@@ -7,7 +7,8 @@ import torch
import os
from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group
from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported,
UnquantizedFusedMoEMethod)
from vllm.model_executor.layers.linear import (LinearBase,
UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization import QuantizationMethods
......@@ -18,7 +19,8 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.model_executor.layers.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.awq import (
is_layer_skipped_awq)
from lmslim.layers.fused_moe.fuse_moe_int4 import fused_experts_w4a16
os.environ['W4A16_MOE_CUDA'] = os.environ.get('W4A16_MOE_CUDA', '0')
......@@ -139,9 +141,9 @@ class MoeWNA16Config(QuantizationConfig):
def get_quant_method(self, layer: torch.nn.Module,
prefix: str) -> Optional["QuantizeMethodBase"]:
if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
return UnquantizedLinearMethod()
elif isinstance(layer, LinearBase):
if isinstance(layer, LinearBase):
if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
return UnquantizedLinearMethod()
# Avoid circular import
from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.model_executor.layers.quantization.awq_marlin import (
......@@ -167,6 +169,9 @@ class MoeWNA16Config(QuantizationConfig):
else:
raise ValueError("moe_wna16 only support gptq and awq.")
elif isinstance(layer, FusedMoE):
if is_layer_skipped_awq(
prefix, getattr(self, "modules_to_not_convert", [])):
return UnquantizedFusedMoEMethod(layer.moe_config)
return MoeWNA16Method(self)
return None
......
......@@ -176,15 +176,19 @@ def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) \
supports_router_weight = not layer.apply_router_weight_on_input
# moe marlin requires the activation to be silu
supports_activation = layer.activation == "silu"
# For now, only devices whose name contains "BW" are supported
device_name = torch.cuda.get_device_properties(torch.cuda.current_device()).name
supports_device = "BW" in device_name
# gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size)
# down: (n, k) = (hidden_size, intermediate_size_per_partition)
# moe marlin requires n % 128 == 0 and k % 64 == 0
supports_shape = hidden_size % 128 == 0 and \
intermediate_size_per_partition % max(64, group_size) == 0
supports_group_size = group_size in [-1, 32, 64, 128]
# For now, only group_size 64 is supported
supports_group_size = group_size in [64]
return supports_shape and supports_group_size and \
supports_router_weight and supports_activation
supports_router_weight and supports_activation and supports_device
def marlin_make_workspace(output_size_per_partition: int,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment