Merge remote-tracking branch 'origin/v0.7.2-dev' into v0.7.2_zero_overhead

ca4ec0ce · lizhigong · 0be169ad · ae0ed592 · ca4ec0ce · ca4ec0ce
Commit ca4ec0ce authored Mar 25, 2025 by lizhigong
20 changed files
--- a/vllm/model_executor/layers/fused_moe/configs/E=64,N=256,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=256,device_name=K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=DCU_K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=DCU_K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -364,6 +364,9 @@ class FusedMoE(torch.nn.Module):
                "CompressedTensorsWNA16MoEMethod"):
            moe_quant_params["intermediate_size_full"] = intermediate_size
+        # Exact class-name match: `in ("BlockInt8MoEMethod")` was a substring
+        # test against a plain string (no trailing comma, so not a tuple).
+        # BlockInt8MoEMethod.create_weights gets the per-partition size.
+        if self.quant_method.__class__.__name__ == "BlockInt8MoEMethod":
+            moe_quant_params["intermediate_size"] = self.intermediate_size_per_partition
        self.quant_method.create_weights(layer=self, **moe_quant_params)
    def _load_per_tensor_weight_scale(self, shard_id: str,

--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -37,7 +37,7 @@ WEIGHT_LOADER_V2_SUPPORTED = [
    "MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod",
    "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod",
    "ModelOptFp8LinearMethod", "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod",
-    "HQQMarlinMethod", "QuarkLinearMethod"
+    "HQQMarlinMethod", "QuarkLinearMethod", "BlockInt8LinearMethod",
 ]
@@ -664,9 +664,12 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
        if isinstance(param, BlockQuantScaleParameter):
            from vllm.model_executor.layers.quantization.fp8 import (
                Fp8LinearMethod, Fp8MoEMethod)
+            from vllm.model_executor.layers.quantization.blockwise_int8 import (
+                BlockInt8LinearMethod, BlockInt8MoEMethod)
            assert self.quant_method is not None
            assert isinstance(self.quant_method,
-                              (Fp8LinearMethod, Fp8MoEMethod))
+                              (Fp8LinearMethod, Fp8MoEMethod, BlockInt8LinearMethod, BlockInt8MoEMethod))
            weight_block_size = self.quant_method.quant_config.weight_block_size
            assert weight_block_size is not None
            block_n, _ = weight_block_size[0], weight_block_size[1]

--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -29,7 +29,8 @@ QUANTIZATION_METHODS: List[str] = [
    "neuron_quant",
    "ipex",
    "quark",
-    "moe_wna16"
+    "moe_wna16",
+    "blockwise_int8"
 ]
 # The customized quantization methods which will be added to this dict.
@@ -101,6 +102,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
    from .neuron_quant import NeuronQuantConfig
    from .qqq import QQQConfig
    from .tpu_int8 import Int8TpuConfig
+    from .blockwise_int8 import BlockInt8Config
    method_to_config: Dict[str, Type[QuantizationConfig]] = {
        "aqlm": AQLMConfig,
@@ -127,6 +129,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
        "ipex": IPEXConfig,
        "quark": QuarkConfig,
        "moe_wna16": MoeWNA16Config,
+        "blockwise_int8": BlockInt8Config,
    }
    # Update the `method_to_config` with customized quantization methods.
    method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG)

--- a/vllm/model_executor/layers/quantization/blockwise_int8.py
+++ b/vllm/model_executor/layers/quantization/blockwise_int8.py
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_12288_4096_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_12288_4096_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_1280_8192_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_1280_8192_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_13824_5120_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_13824_5120_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_14336_8192_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_14336_8192_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_15360_5120_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_15360_5120_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_22016_4096_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_22016_4096_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_2560_8192_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_2560_8192_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_27648_5120_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_27648_5120_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_28672_4096_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_28672_4096_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_28672_8192_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_28672_8192_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_32000_4096_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_32000_4096_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_3584_18944_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_3584_18944_K100_AI.json