Commit 2cfcb974 authored by zhuwenwen
Browse files

新增dpsk-v3.1-awq的支持

parent a02a1c83
...@@ -130,7 +130,7 @@ class AWQConfig(QuantizationConfig): ...@@ -130,7 +130,7 @@ class AWQConfig(QuantizationConfig):
return "awq" return "awq"
def get_supported_act_dtypes(self) -> list[torch.dtype]: def get_supported_act_dtypes(self) -> list[torch.dtype]:
return [torch.half] return [torch.half, torch.bfloat16]
@classmethod @classmethod
def get_min_capability(cls) -> int: def get_min_capability(cls) -> int:
...@@ -293,7 +293,7 @@ class AWQLinearMethod(LinearMethodBase): ...@@ -293,7 +293,7 @@ class AWQLinearMethod(LinearMethodBase):
pad_group=2 pad_group=2
dim_n = layer.scales.data.shape[1] dim_n = layer.scales.data.shape[1]
dim_k = layer.qweight.data.shape[0] dim_k = layer.qweight.data.shape[0]
_qw, _sz=ops.convert_s4(layer.qweight,layer.qzeros,layer.scales,int(group_size)) _qw, _sz=ops.convert_s4(layer.qweight,layer.qzeros,layer.scales.to(torch.float16),int(group_size))
sz = ops.sz_permute(_sz).reshape(-1,dim_n) sz = ops.sz_permute(_sz).reshape(-1,dim_n)
sz = sz.reshape(dim_n,-1) sz = sz.reshape(dim_n,-1)
_qw = _qw.reshape(dim_n,-1) _qw = _qw.reshape(dim_n,-1)
......
...@@ -141,6 +141,9 @@ class AWQMarlinConfig(QuantizationConfig): ...@@ -141,6 +141,9 @@ class AWQMarlinConfig(QuantizationConfig):
self.full_config).get_quant_method(layer, prefix) self.full_config).get_quant_method(layer, prefix)
return AWQMarlinLinearMethod(self) return AWQMarlinLinearMethod(self)
elif isinstance(layer, FusedMoE): elif isinstance(layer, FusedMoE):
if is_layer_skipped_awq(
prefix, getattr(self, "modules_to_not_convert", [])):
return UnquantizedFusedMoEMethod(layer.moe_config)
from vllm.model_executor.layers.quantization.moe_wna16 import ( from vllm.model_executor.layers.quantization.moe_wna16 import (
MoeWNA16Config) MoeWNA16Config)
if is_layer_skipped_awq( if is_layer_skipped_awq(
...@@ -448,7 +451,7 @@ class AWQMoEMethod(FusedMoEMethodBase): ...@@ -448,7 +451,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
# Why does this take the intermediate size for size_k? # Why does this take the intermediate size for size_k?
marlin_w13_scales = marlin_moe_permute_scales( marlin_w13_scales = marlin_moe_permute_scales(
s=layer.w13_scales, s=layer.w13_scales.to(torch.float16),
size_k=layer.intermediate_size_per_partition, size_k=layer.intermediate_size_per_partition,
size_n=layer.w13_scales.shape[2], size_n=layer.w13_scales.shape[2],
group_size=self.quant_config.group_size, group_size=self.quant_config.group_size,
...@@ -457,7 +460,7 @@ class AWQMoEMethod(FusedMoEMethodBase): ...@@ -457,7 +460,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
#replace_parameter(layer, "w13_scales", marlin_w13_scales) #replace_parameter(layer, "w13_scales", marlin_w13_scales)
marlin_w2_scales = marlin_moe_permute_scales( marlin_w2_scales = marlin_moe_permute_scales(
s=layer.w2_scales, s=layer.w2_scales.to(torch.float16),
size_k=layer.intermediate_size_per_partition, size_k=layer.intermediate_size_per_partition,
size_n=layer.w2_scales.shape[2], size_n=layer.w2_scales.shape[2],
group_size=self.quant_config.group_size, group_size=self.quant_config.group_size,
......
...@@ -7,7 +7,8 @@ import torch ...@@ -7,7 +7,8 @@ import torch
import os import os
from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group
from vllm.model_executor.layers.fused_moe.layer import ( from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported) FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported,
UnquantizedFusedMoEMethod)
from vllm.model_executor.layers.linear import (LinearBase, from vllm.model_executor.layers.linear import (LinearBase,
UnquantizedLinearMethod) UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization import QuantizationMethods
...@@ -19,6 +20,8 @@ from vllm.model_executor.utils import set_weight_attrs ...@@ -19,6 +20,8 @@ from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.awq import (
is_layer_skipped_awq)
from lmslim.layers.fused_moe.fuse_moe_int4 import fused_experts_w4a16 from lmslim.layers.fused_moe.fuse_moe_int4 import fused_experts_w4a16
os.environ['W4A16_MOE_CUDA'] = os.environ.get('W4A16_MOE_CUDA', '0') os.environ['W4A16_MOE_CUDA'] = os.environ.get('W4A16_MOE_CUDA', '0')
...@@ -139,9 +142,9 @@ class MoeWNA16Config(QuantizationConfig): ...@@ -139,9 +142,9 @@ class MoeWNA16Config(QuantizationConfig):
def get_quant_method(self, layer: torch.nn.Module, def get_quant_method(self, layer: torch.nn.Module,
prefix: str) -> Optional["QuantizeMethodBase"]: prefix: str) -> Optional["QuantizeMethodBase"]:
if is_layer_skipped_quant(prefix, self.modules_to_not_convert): if isinstance(layer, LinearBase):
return UnquantizedLinearMethod() if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
elif isinstance(layer, LinearBase): return UnquantizedLinearMethod()
# Avoid circular import # Avoid circular import
from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.model_executor.layers.quantization.awq_marlin import ( from vllm.model_executor.layers.quantization.awq_marlin import (
...@@ -167,6 +170,9 @@ class MoeWNA16Config(QuantizationConfig): ...@@ -167,6 +170,9 @@ class MoeWNA16Config(QuantizationConfig):
else: else:
raise ValueError("moe_wna16 only support gptq and awq.") raise ValueError("moe_wna16 only support gptq and awq.")
elif isinstance(layer, FusedMoE): elif isinstance(layer, FusedMoE):
if is_layer_skipped_awq(
prefix, getattr(self, "modules_to_not_convert", [])):
return UnquantizedFusedMoEMethod(layer.moe_config)
return MoeWNA16Method(self, layer.moe_config) return MoeWNA16Method(self, layer.moe_config)
return None return None
......
...@@ -177,14 +177,19 @@ def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) \ ...@@ -177,14 +177,19 @@ def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) \
# moe marlin requires the activation to be silu # moe marlin requires the activation to be silu
supports_activation = layer.activation == "silu" supports_activation = layer.activation == "silu"
#暂时只支持bw
device_name = torch.cuda.get_device_properties(torch.cuda.current_device()).name
supports_device = "BW" in device_name
# gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size) # gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size)
# down: (n, k) = (hidden_size, intermediate_size_per_partition) # down: (n, k) = (hidden_size, intermediate_size_per_partition)
# moe marlin requires n % 128 == 0 and k % 64 == 0 # moe marlin requires n % 128 == 0 and k % 64 == 0
supports_shape = hidden_size % 128 == 0 and \ supports_shape = hidden_size % 128 == 0 and \
intermediate_size_per_partition % max(64, group_size) == 0 intermediate_size_per_partition % max(64, group_size) == 0
supports_group_size = group_size in [-1, 32, 64, 128]
#暂时只支持64
supports_group_size = group_size in [64]
return supports_shape and supports_group_size and \ return supports_shape and supports_group_size and \
supports_router_weight and supports_activation supports_router_weight and supports_activation and supports_device
def marlin_make_workspace(output_size_per_partition: int, def marlin_make_workspace(output_size_per_partition: int,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment