"vscode:/vscode.git/clone" did not exist on "265ecb05fb807b3bfbf565380a00853a846b93ab"
Unverified commit e8b055a5, authored by Matthias Gehre, committed by GitHub
Browse files

[Bugfix] Handle ParallelLMHead in compressed-tensors get_quant_method (#37291)


Signed-off-by: Matthias Gehre <matthias.gehre@amd.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
parent 246dc7d8
...@@ -5,13 +5,20 @@ ...@@ -5,13 +5,20 @@
Run `pytest tests/quantization/test_compressed_tensors.py`. Run `pytest tests/quantization/test_compressed_tensors.py`.
""" """
from unittest.mock import Mock
import pytest import pytest
import torch import torch
from compressed_tensors.quantization import QuantizationType from compressed_tensors.quantization import (
QuantizationArgs,
QuantizationStrategy,
QuantizationType,
)
from tests.models.utils import check_logprobs_close from tests.models.utils import check_logprobs_close
from vllm.model_executor.layers.fused_moe import UnquantizedFusedMoEMethod from vllm.model_executor.layers.fused_moe import UnquantizedFusedMoEMethod
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
CompressedTensorsConfig,
CompressedTensorsLinearMethod, CompressedTensorsLinearMethod,
CompressedTensorsW4A4Fp4, CompressedTensorsW4A4Fp4,
CompressedTensorsW4A8Fp8, CompressedTensorsW4A8Fp8,
...@@ -26,6 +33,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8 ...@@ -26,6 +33,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8
from vllm.model_executor.layers.quantization.utils.nvfp4_utils import ( from vllm.model_executor.layers.quantization.utils.nvfp4_utils import (
cutlass_fp4_supported, cutlass_fp4_supported,
) )
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.v1.attention.backends.fa_utils import get_flash_attn_version from vllm.v1.attention.backends.fa_utils import get_flash_attn_version
...@@ -558,3 +566,72 @@ def test_w4a16_moe_torch_compile(vllm_runner): ...@@ -558,3 +566,72 @@ def test_w4a16_moe_torch_compile(vllm_runner):
) as llm: ) as llm:
output = llm.generate_greedy("Hi", max_tokens=1) output = llm.generate_greedy("Hi", max_tokens=1)
assert output assert output
def _make_ct_config(*, target: str = "Linear") -> CompressedTensorsConfig:
    """Return a small CompressedTensorsConfig with one quantization target.

    The single entry of ``target_scheme_map`` is keyed by ``target`` and
    describes static, symmetric, per-channel INT8 weight quantization with
    no input-activation quantization. The ignore list, sparsity scheme map
    and sparsity ignore list are all empty.

    Args:
        target: Key of the only ``target_scheme_map`` entry, e.g. ``"Linear"``
            or a ``"re:"``-prefixed pattern such as ``"re:.*lm_head"``.

    Returns:
        The freshly constructed config.
    """
    int8_channel_weights = QuantizationArgs(
        num_bits=8,
        type=QuantizationType.INT,
        strategy=QuantizationStrategy.CHANNEL,
        symmetric=True,
        dynamic=False,
    )
    scheme = {
        "weights": int8_channel_weights,
        "input_activations": None,
        "format": "pack-quantized",
    }
    return CompressedTensorsConfig(
        target_scheme_map={target: scheme},
        ignore=[],
        quant_format="pack-quantized",
        sparsity_scheme_map={},
        sparsity_ignore_list=[],
    )
def test_get_quant_method_returns_linear_method_for_parallel_lm_head():
    """A ParallelLMHead whose prefix matches a target gets the linear method."""
    lm_head = Mock(spec=ParallelLMHead)
    # Present the mock as a ParallelLMHead so isinstance() checks accept it.
    lm_head.__class__ = ParallelLMHead
    ct_config = _make_ct_config(target="re:.*lm_head")

    method = ct_config.get_quant_method(lm_head, prefix="model.lm_head")

    assert isinstance(
        method, CompressedTensorsLinearMethod
    ), f"Expected CompressedTensorsLinearMethod, got {type(method).__name__}"
def test_get_quant_method_returns_none_for_ignored_parallel_lm_head():
    """An ignored ParallelLMHead yields ``None`` even if a target matches it."""
    lm_head_pattern = "re:.*lm_head"
    ct_config = _make_ct_config(target=lm_head_pattern)
    # The same pattern is both targeted and ignored; the test expects the
    # ignore list to win.
    ct_config.ignore = [lm_head_pattern]

    lm_head = Mock(spec=ParallelLMHead)
    # Present the mock as a ParallelLMHead so isinstance() checks accept it.
    lm_head.__class__ = ParallelLMHead

    method = ct_config.get_quant_method(lm_head, prefix="model.lm_head")

    assert (
        method is None
    ), f"Expected None for ignored ParallelLMHead, got {type(method).__name__}"
def test_get_quant_method_returns_none_for_unmatched_parallel_lm_head():
    """A ParallelLMHead that matches no target is left unquantized.

    The config here targets only 'Linear', which is the usual setup for
    compressed-tensors models. A ParallelLMHead does not match that target,
    so get_quant_method is expected to return None rather than raise
    ValueError.
    """
    lm_head = Mock(spec=ParallelLMHead)
    # Present the mock as a ParallelLMHead so isinstance() checks accept it.
    lm_head.__class__ = ParallelLMHead
    ct_config = _make_ct_config(target="Linear")

    method = ct_config.get_quant_method(lm_head, prefix="model.lm_head")

    assert (
        method is None
    ), f"Expected None for unmatched ParallelLMHead, got {type(method).__name__}"
...@@ -62,6 +62,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( ...@@ -62,6 +62,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
should_ignore_layer, should_ignore_layer,
) )
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.platforms import current_platform from vllm.platforms import current_platform
if TYPE_CHECKING: if TYPE_CHECKING:
...@@ -179,6 +180,15 @@ class CompressedTensorsConfig(QuantizationConfig): ...@@ -179,6 +180,15 @@ class CompressedTensorsConfig(QuantizationConfig):
else: else:
return quant_method return quant_method
if isinstance(layer, ParallelLMHead):
try:
quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
except ValueError:
quant_scheme = None
if quant_scheme is not None:
layer.scheme = quant_scheme
return CompressedTensorsLinearMethod(self)
if isinstance(layer, Attention): if isinstance(layer, Attention):
return CompressedTensorsKVCacheMethod(self) return CompressedTensorsKVCacheMethod(self)
if isinstance(layer, FusedMoE): if isinstance(layer, FusedMoE):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment