Unverified commit 48138a84, authored by Dipika Sikka, committed by GitHub
Browse files

[BugFix] Stop silent failures on compressed-tensors parsing (#9381)

parent 343f8e09
...@@ -31,4 +31,4 @@ pyyaml ...@@ -31,4 +31,4 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL. einops # Required for Qwen2-VL.
compressed-tensors == 0.6.0 # required for compressed-tensors compressed-tensors == 0.7.1 # required for compressed-tensors
...@@ -100,12 +100,21 @@ class CompressedTensorsConfig(QuantizationConfig): ...@@ -100,12 +100,21 @@ class CompressedTensorsConfig(QuantizationConfig):
target_scheme_map[target][ target_scheme_map[target][
"weights"] = QuantizationArgs.parse_obj( "weights"] = QuantizationArgs.parse_obj(
quant_config.get("weights")) quant_config.get("weights"))
try:
target_scheme_map[target]["input_activations"] = None
if is_activation_quantization_format(quant_format):
input_activations = quant_config.get("input_activations")
# The only case where activation quant is supported but no
# input_activations are provided in the config should be
# w8a16fp8. Note that w8a16fp8 can also run for cases where
# there is an input_quant, but it is ignored.
if not input_activations:
assert target_scheme_map[target][
"weights"].type == QuantizationType.FLOAT
else:
target_scheme_map[target][ target_scheme_map[target][
"input_activations"] = QuantizationArgs.parse_obj( "input_activations"] = QuantizationArgs.parse_obj(
quant_config.get("input_activations")) quant_config.get("input_activations"))
except Exception:
target_scheme_map[target]["input_activations"] = None
return cls(target_scheme_map=target_scheme_map, return cls(target_scheme_map=target_scheme_map,
ignore=ignore, ignore=ignore,
...@@ -244,8 +253,6 @@ class CompressedTensorsConfig(QuantizationConfig): ...@@ -244,8 +253,6 @@ class CompressedTensorsConfig(QuantizationConfig):
group_size=weight_quant.group_size, group_size=weight_quant.group_size,
actorder=weight_quant.actorder) actorder=weight_quant.actorder)
# Detect If Activation Quantization.
# TODO @dsikka: clean-up conditions
if is_activation_quantization_format(self.quant_format): if is_activation_quantization_format(self.quant_format):
if self._is_fp8_w8a8(weight_quant, input_quant): if self._is_fp8_w8a8(weight_quant, input_quant):
is_fp8_w8a8_supported = self._check_scheme_supported( is_fp8_w8a8_supported = self._check_scheme_supported(
...@@ -256,16 +263,19 @@ class CompressedTensorsConfig(QuantizationConfig): ...@@ -256,16 +263,19 @@ class CompressedTensorsConfig(QuantizationConfig):
is_static_input_scheme=(input_quant is_static_input_scheme=(input_quant
and not input_quant.dynamic)) and not input_quant.dynamic))
else: else:
# note: input_quant will be present for converted models;
# will be ignored during inference post loading
return CompressedTensorsW8A16Fp8( return CompressedTensorsW8A16Fp8(
strategy=weight_quant.strategy, strategy=weight_quant.strategy,
is_static_input_scheme=(input_quant is_static_input_scheme=not input_quant.dynamic)
and not input_quant.dynamic))
# note: input_quant can be None
if self._is_fp8_w8a16(weight_quant, input_quant): if self._is_fp8_w8a16(weight_quant, input_quant):
is_static_input_scheme = (input_quant
and not input_quant.dynamic)
return CompressedTensorsW8A16Fp8( return CompressedTensorsW8A16Fp8(
strategy=weight_quant.strategy, strategy=weight_quant.strategy,
is_static_input_scheme=(input_quant is_static_input_scheme=is_static_input_scheme)
and not input_quant.dynamic))
if self._is_static_tensor_w8a8(weight_quant, input_quant): if self._is_static_tensor_w8a8(weight_quant, input_quant):
return CompressedTensorsW8A8Int8( return CompressedTensorsW8A8Int8(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment