Commit 66b809cc authored by zhuwenwen
Browse files

Merge tag 'v0.7.2' into v0.7.2-dev

parents 37b63c24 0408efc6
# SPDX-License-Identifier: Apache-2.0
import os import os
from typing import Dict, List, Optional, Type from typing import Dict, List, Optional, Type
......
# SPDX-License-Identifier: Apache-2.0
from typing import Optional, Tuple from typing import Optional, Tuple
import torch import torch
......
# SPDX-License-Identifier: Apache-2.0
from typing import Optional, Tuple from typing import Optional, Tuple
import torch import torch
......
# SPDX-License-Identifier: Apache-2.0
import warnings import warnings
from typing import Optional, Tuple from typing import Optional, Tuple
......
# SPDX-License-Identifier: Apache-2.0
import torch import torch
from vllm.logger import init_logger from vllm.logger import init_logger
......
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
import torch import torch
......
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
import torch import torch
......
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Callable, Dict, List, Optional from typing import Any, Callable, Dict, List, Optional
import torch import torch
...@@ -5,17 +7,15 @@ import torch ...@@ -5,17 +7,15 @@ import torch
from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group
from vllm.model_executor.layers.fused_moe.layer import ( from vllm.model_executor.layers.fused_moe.layer import (
FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.model_executor.layers.linear import (LinearBase,
from vllm.model_executor.layers.quantization.awq import (AWQConfig, UnquantizedLinearMethod)
AWQLinearMethod) from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.model_executor.layers.quantization.awq_marlin import ( from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
AWQMarlinConfig, AWQMarlinLinearMethod)
from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase) QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.layers.quantization.gptq import (GPTQConfig, from vllm.model_executor.layers.quantization.gptq import GPTQConfig
GPTQLinearMethod)
from vllm.model_executor.layers.quantization.gptq_marlin import ( from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQMarlinConfig, GPTQMarlinLinearMethod) GPTQMarlinConfig)
from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -126,25 +126,26 @@ class MoeWNA16Config(QuantizationConfig): ...@@ -126,25 +126,26 @@ class MoeWNA16Config(QuantizationConfig):
prefix: str) -> Optional["QuantizeMethodBase"]: prefix: str) -> Optional["QuantizeMethodBase"]:
if is_layer_skipped_quant(prefix, self.modules_to_not_convert): if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
return UnquantizedLinearMethod() return UnquantizedLinearMethod()
elif isinstance(layer, FusedMoE): elif isinstance(layer, LinearBase):
return MoeWNA16Method(self)
else:
if self.linear_quant_method == "gptq": if self.linear_quant_method == "gptq":
if self.use_marlin: if self.use_marlin:
return GPTQMarlinLinearMethod( return GPTQMarlinConfig.from_config(
GPTQMarlinConfig.from_config(self.full_config)) self.full_config).get_quant_method(layer, prefix)
else: else:
return GPTQLinearMethod( return GPTQConfig.from_config(
GPTQConfig.from_config(self.full_config)) self.full_config).get_quant_method(layer, prefix)
elif self.linear_quant_method == "awq": elif self.linear_quant_method == "awq":
if self.use_marlin: if self.use_marlin:
return AWQMarlinLinearMethod( return AWQMarlinConfig.from_config(
AWQMarlinConfig.from_config(self.full_config)) self.full_config).get_quant_method(layer, prefix)
else: else:
return AWQLinearMethod( return AWQConfig.from_config(
AWQConfig.from_config(self.full_config)) self.full_config).get_quant_method(layer, prefix)
else: else:
raise ValueError("moe_wna16 only support gptq and awq.") raise ValueError("moe_wna16 only support gptq and awq.")
elif isinstance(layer, FusedMoE):
return MoeWNA16Method(self)
return None
def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]): def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]):
......
# SPDX-License-Identifier: Apache-2.0
import os import os
from importlib.util import find_spec from importlib.util import find_spec
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
......
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
import torch import torch
......
# SPDX-License-Identifier: Apache-2.0
import fnmatch import fnmatch
import re import re
from typing import Any, Dict, List, Optional, cast from typing import Any, Dict, List, Optional, cast
...@@ -16,8 +18,6 @@ from vllm.model_executor.layers.quantization.quark.schemes import ( ...@@ -16,8 +18,6 @@ from vllm.model_executor.layers.quantization.quark.schemes import (
QuarkScheme, QuarkW8A8Fp8, QuarkW8A8Int8) QuarkScheme, QuarkW8A8Fp8, QuarkW8A8Int8)
from vllm.model_executor.layers.quantization.quark.utils import ( from vllm.model_executor.layers.quantization.quark.utils import (
deep_compare, should_ignore_layer) deep_compare, should_ignore_layer)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
FUSED_LAYER_NAME_MAPPING)
from vllm.platforms import current_platform from vllm.platforms import current_platform
__all__ = ["QuarkLinearMethod"] __all__ = ["QuarkLinearMethod"]
...@@ -56,7 +56,9 @@ class QuarkConfig(QuantizationConfig): ...@@ -56,7 +56,9 @@ class QuarkConfig(QuantizationConfig):
# Check if the layer is skipped for quantization. # Check if the layer is skipped for quantization.
exclude_layers = cast(List[str], self.quant_config.get("exclude")) exclude_layers = cast(List[str], self.quant_config.get("exclude"))
if should_ignore_layer(prefix, ignore=exclude_layers): if should_ignore_layer(prefix,
ignore=exclude_layers,
fused_mapping=self.packed_modules_mapping):
return UnquantizedLinearMethod() return UnquantizedLinearMethod()
if isinstance(layer, LinearBase): if isinstance(layer, LinearBase):
scheme = self.get_scheme(layer=layer, layer_name=prefix) scheme = self.get_scheme(layer=layer, layer_name=prefix)
...@@ -199,8 +201,8 @@ class QuarkConfig(QuantizationConfig): ...@@ -199,8 +201,8 @@ class QuarkConfig(QuantizationConfig):
module: torch.nn.Module) -> Dict[str, Any]: module: torch.nn.Module) -> Dict[str, Any]:
proj_name = layer_name.split(".")[-1] proj_name = layer_name.split(".")[-1]
if proj_name in FUSED_LAYER_NAME_MAPPING: if proj_name in self.packed_modules_mapping:
shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] shard_proj_names = self.packed_modules_mapping[proj_name]
# Convert fused_name --> [shard_names] # Convert fused_name --> [shard_names]
shard_names = [ shard_names = [
......
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Callable, Dict, Optional from typing import Any, Callable, Dict, Optional
import torch import torch
......
# SPDX-License-Identifier: Apache-2.0
from .quark_scheme import QuarkScheme from .quark_scheme import QuarkScheme
from .quark_w8a8_fp8 import QuarkW8A8Fp8 from .quark_w8a8_fp8 import QuarkW8A8Fp8
from .quark_w8a8_int8 import QuarkW8A8Int8 from .quark_w8a8_int8 import QuarkW8A8Int8
......
# SPDX-License-Identifier: Apache-2.0
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Optional from typing import Optional
......
# SPDX-License-Identifier: Apache-2.0
from typing import Callable, List, Optional from typing import Callable, List, Optional
import torch import torch
......
# SPDX-License-Identifier: Apache-2.0
from typing import Callable, List, Optional, Set from typing import Callable, List, Optional, Set
import torch import torch
......
import re # SPDX-License-Identifier: Apache-2.0
from typing import Any, Iterable, Optional
from vllm.model_executor.layers.quantization.utils.quant_utils import ( import re
FUSED_LAYER_NAME_MAPPING) from types import MappingProxyType
from typing import Any, Iterable, List, Mapping, Optional
def deep_compare(dict1: Any, dict2: Any) -> bool: def deep_compare(dict1: Any, dict2: Any) -> bool:
...@@ -18,8 +18,11 @@ def deep_compare(dict1: Any, dict2: Any) -> bool: ...@@ -18,8 +18,11 @@ def deep_compare(dict1: Any, dict2: Any) -> bool:
return dict1 == dict2 return dict1 == dict2
def should_ignore_layer(layer_name: Optional[str], def should_ignore_layer(
ignore: Iterable[str]) -> bool: layer_name: Optional[str],
ignore: Iterable[str],
fused_mapping: Mapping[str, List[str]] = MappingProxyType({})
) -> bool:
if layer_name is None: if layer_name is None:
return False return False
...@@ -31,8 +34,8 @@ def should_ignore_layer(layer_name: Optional[str], ...@@ -31,8 +34,8 @@ def should_ignore_layer(layer_name: Optional[str],
# in the safetensors checkpoint. So, we convert the name # in the safetensors checkpoint. So, we convert the name
# from the fused version to unfused + check to make sure that # from the fused version to unfused + check to make sure that
# each shard of the fused layer has the same scheme. # each shard of the fused layer has the same scheme.
if proj_name in FUSED_LAYER_NAME_MAPPING: if proj_name in fused_mapping:
shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] shard_proj_names = fused_mapping[proj_name]
# Convert fused_name --> [shard_names] # Convert fused_name --> [shard_names]
shard_names = [ shard_names = [
......
# SPDX-License-Identifier: Apache-2.0
""" """
This file contains the Pydantic schemas for various quantization-related This file contains the Pydantic schemas for various quantization-related
parameters. When a relevant quantization technique is specified, these parameters. When a relevant quantization technique is specified, these
......
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import torch import torch
......
# SPDX-License-Identifier: Apache-2.0
from .layer_utils import replace_parameter, update_tensor_inplace from .layer_utils import replace_parameter, update_tensor_inplace
__all__ = ['update_tensor_inplace', 'replace_parameter'] __all__ = ['update_tensor_inplace', 'replace_parameter']
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment