Unverified commit ec82c3e3, authored by Wenhua Cheng, committed by GitHub
Browse files

FIX MOE issue in AutoRound format (#18586)


Signed-off-by: wenhuach21 <wenhua.cheng@intel.com>
parent 45ab403a
...@@ -58,7 +58,7 @@ vLLM is fast with: ...@@ -58,7 +58,7 @@ vLLM is fast with:
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
- Continuous batching of incoming requests - Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph - Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8. - Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8.
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. - Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Speculative decoding - Speculative decoding
- Chunked prefill - Chunked prefill
......
...@@ -8,6 +8,7 @@ import torch ...@@ -8,6 +8,7 @@ import torch
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.linear import (LinearBase, from vllm.model_executor.layers.linear import (LinearBase,
UnquantizedLinearMethod) UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig) QuantizationConfig)
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
...@@ -74,7 +75,7 @@ class AutoRoundConfig(QuantizationConfig): ...@@ -74,7 +75,7 @@ class AutoRoundConfig(QuantizationConfig):
f"group_size={self.group_size}, sym={self.sym})") f"group_size={self.group_size}, sym={self.sym})")
@classmethod @classmethod
def get_name(cls): ## use str will trigger preci issue def get_name(cls) -> QuantizationMethods:
return "auto-round" return "auto-round"
@classmethod @classmethod
...@@ -142,18 +143,18 @@ class AutoRoundConfig(QuantizationConfig): ...@@ -142,18 +143,18 @@ class AutoRoundConfig(QuantizationConfig):
prefix, layer.__class__.__name__, weight_bits, group_size, prefix, layer.__class__.__name__, weight_bits, group_size,
sym) sym)
if backend == "auto" or "marlin" in backend: if backend == "auto" or "marlin" in backend:
if isinstance(layer, FusedMoE):
use_marlin = check_moe_marlin_supports_layer(layer, group_size)
else:
AWQ_TYPE_MAP = { AWQ_TYPE_MAP = {
4: scalar_types.uint4, 4: scalar_types.uint4,
8: scalar_types.uint8, 8: scalar_types.uint8,
} }
use_marlin = ((weight_bits, sym) in AWQ_TYPE_MAP use_marlin = (weight_bits
and check_marlin_supported( in AWQ_TYPE_MAP) and check_marlin_supported(
AWQ_TYPE_MAP[(weight_bits)], group_size, AWQ_TYPE_MAP[weight_bits], group_size, not sym)
not sym))
if isinstance(layer, FusedMoE):
use_marlin = use_marlin and check_moe_marlin_supports_layer(
layer, group_size)
else: else:
use_marlin = False use_marlin = False
if use_marlin: if use_marlin:
...@@ -180,10 +181,11 @@ class AutoRoundConfig(QuantizationConfig): ...@@ -180,10 +181,11 @@ class AutoRoundConfig(QuantizationConfig):
from vllm.model_executor.layers.quantization.moe_wna16 import ( from vllm.model_executor.layers.quantization.moe_wna16 import (
MoeWNA16Config) MoeWNA16Config)
config = { config = {
"linear_quant_method": "awq", "quant_method": "awq",
"weight_bits": weight_bits, "bits": weight_bits,
"group_size": group_size, "group_size": group_size,
"zero_point": not sym, "zero_point": not sym,
"lm_head": False,
} }
return MoeWNA16Config.from_config(config).get_quant_method( return MoeWNA16Config.from_config(config).get_quant_method(
layer, prefix) layer, prefix)
...@@ -213,9 +215,6 @@ class AutoRoundConfig(QuantizationConfig): ...@@ -213,9 +215,6 @@ class AutoRoundConfig(QuantizationConfig):
prefix, layer.__class__.__name__, weight_bits, group_size, prefix, layer.__class__.__name__, weight_bits, group_size,
sym) sym)
if backend == "auto" or "marlin" in backend: if backend == "auto" or "marlin" in backend:
if isinstance(layer, FusedMoE):
use_marlin = check_moe_marlin_supports_layer(layer, group_size)
else:
GPTQ_TYPE_MAP = { GPTQ_TYPE_MAP = {
(4, True): scalar_types.uint4b8, (4, True): scalar_types.uint4b8,
(8, True): scalar_types.uint8b128, (8, True): scalar_types.uint8b128,
...@@ -225,6 +224,9 @@ class AutoRoundConfig(QuantizationConfig): ...@@ -225,6 +224,9 @@ class AutoRoundConfig(QuantizationConfig):
GPTQ_TYPE_MAP[(weight_bits, sym)], GPTQ_TYPE_MAP[(weight_bits, sym)],
group_size, group_size,
has_zp=not sym)) has_zp=not sym))
if isinstance(layer, FusedMoE):
use_marlin = use_marlin and check_moe_marlin_supports_layer(
layer, group_size)
else: else:
use_marlin = False use_marlin = False
if use_marlin: if use_marlin:
...@@ -251,11 +253,11 @@ class AutoRoundConfig(QuantizationConfig): ...@@ -251,11 +253,11 @@ class AutoRoundConfig(QuantizationConfig):
from vllm.model_executor.layers.quantization.moe_wna16 import ( from vllm.model_executor.layers.quantization.moe_wna16 import (
MoeWNA16Config) MoeWNA16Config)
config = { config = {
"linear_quant_method": "gptq", "quant_method": "gptq",
"weight_bits": weight_bits, "bits": weight_bits,
"group_size": group_size, "group_size": group_size,
"sym": sym, "sym": sym,
"lm_head_quantized": False, "lm_head": False,
} }
return MoeWNA16Config.from_config(config).get_quant_method( return MoeWNA16Config.from_config(config).get_quant_method(
layer, prefix) layer, prefix)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment