Commit ca4ec0ce authored by lizhigong
Browse files

Merge remote-tracking branch 'origin/v0.7.2-dev' into v0.7.2_zero_overhead

parents 0be169ad ae0ed592
...@@ -364,6 +364,9 @@ class FusedMoE(torch.nn.Module): ...@@ -364,6 +364,9 @@ class FusedMoE(torch.nn.Module):
"CompressedTensorsWNA16MoEMethod"): "CompressedTensorsWNA16MoEMethod"):
moe_quant_params["intermediate_size_full"] = intermediate_size moe_quant_params["intermediate_size_full"] = intermediate_size
if (self.quant_method.__class__.__name__ in ("BlockInt8MoEMethod")):
moe_quant_params["intermediate_size"] = self.intermediate_size_per_partition
self.quant_method.create_weights(layer=self, **moe_quant_params) self.quant_method.create_weights(layer=self, **moe_quant_params)
def _load_per_tensor_weight_scale(self, shard_id: str, def _load_per_tensor_weight_scale(self, shard_id: str,
......
...@@ -37,7 +37,7 @@ WEIGHT_LOADER_V2_SUPPORTED = [ ...@@ -37,7 +37,7 @@ WEIGHT_LOADER_V2_SUPPORTED = [
"MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod", "MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod",
"TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod", "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod",
"ModelOptFp8LinearMethod", "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod", "ModelOptFp8LinearMethod", "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod",
"HQQMarlinMethod", "QuarkLinearMethod" "HQQMarlinMethod", "QuarkLinearMethod", "BlockInt8LinearMethod",
] ]
...@@ -664,9 +664,12 @@ class MergedColumnParallelLinear(ColumnParallelLinear): ...@@ -664,9 +664,12 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
if isinstance(param, BlockQuantScaleParameter): if isinstance(param, BlockQuantScaleParameter):
from vllm.model_executor.layers.quantization.fp8 import ( from vllm.model_executor.layers.quantization.fp8 import (
Fp8LinearMethod, Fp8MoEMethod) Fp8LinearMethod, Fp8MoEMethod)
from vllm.model_executor.layers.quantization.blockwise_int8 import (
BlockInt8LinearMethod, BlockInt8MoEMethod)
assert self.quant_method is not None assert self.quant_method is not None
assert isinstance(self.quant_method, assert isinstance(self.quant_method,
(Fp8LinearMethod, Fp8MoEMethod)) (Fp8LinearMethod, Fp8MoEMethod, BlockInt8LinearMethod, BlockInt8MoEMethod))
weight_block_size = self.quant_method.quant_config.weight_block_size weight_block_size = self.quant_method.quant_config.weight_block_size
assert weight_block_size is not None assert weight_block_size is not None
block_n, _ = weight_block_size[0], weight_block_size[1] block_n, _ = weight_block_size[0], weight_block_size[1]
......
...@@ -29,7 +29,8 @@ QUANTIZATION_METHODS: List[str] = [ ...@@ -29,7 +29,8 @@ QUANTIZATION_METHODS: List[str] = [
"neuron_quant", "neuron_quant",
"ipex", "ipex",
"quark", "quark",
"moe_wna16" "moe_wna16",
"blockwise_int8"
] ]
# The customized quantization methods which will be added to this dict. # The customized quantization methods which will be added to this dict.
...@@ -101,6 +102,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: ...@@ -101,6 +102,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
from .neuron_quant import NeuronQuantConfig from .neuron_quant import NeuronQuantConfig
from .qqq import QQQConfig from .qqq import QQQConfig
from .tpu_int8 import Int8TpuConfig from .tpu_int8 import Int8TpuConfig
from .blockwise_int8 import BlockInt8Config
method_to_config: Dict[str, Type[QuantizationConfig]] = { method_to_config: Dict[str, Type[QuantizationConfig]] = {
"aqlm": AQLMConfig, "aqlm": AQLMConfig,
...@@ -127,6 +129,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: ...@@ -127,6 +129,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
"ipex": IPEXConfig, "ipex": IPEXConfig,
"quark": QuarkConfig, "quark": QuarkConfig,
"moe_wna16": MoeWNA16Config, "moe_wna16": MoeWNA16Config,
"blockwise_int8": BlockInt8Config,
} }
# Update the `method_to_config` with customized quantization methods. # Update the `method_to_config` with customized quantization methods.
method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG)
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment