[Misc] Add compressed-tensors to optimized quant list (#7006)

a0dce938 · Michael Goin · GitHub · 35e9c12b · a0dce938
Unverified Commit a0dce938 authored Jul 31, 2024 by Michael Goin Committed by GitHub Jul 31, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 4 deletions

vllm/config.py vllm/config.py +6 -4

No files found.
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -197,13 +197,17 @@ class ModelConfig:
    def _parse_quant_hf_config(self):
        quant_cfg = getattr(self.hf_config, "quantization_config", None)
        if quant_cfg is None:
-            # compress-tensors uses a "compression_config" key
+            # compressed-tensors uses a "compression_config" key
            quant_cfg = getattr(self.hf_config, "compression_config", None)
        return quant_cfg
    def _verify_quantization(self) -> None:
        supported_quantization = [*QUANTIZATION_METHODS]
        rocm_supported_quantization = ["gptq", "squeezellm"]
+        optimized_quantization_methods = [
+            "fp8", "marlin", "gptq_marlin_24", "gptq_marlin", "awq_marlin",
+            "fbgemm_fp8", "compressed_tensors", "compressed-tensors"
+        ]
        if self.quantization is not None:
            self.quantization = self.quantization.lower()
@@ -242,9 +246,7 @@ class ModelConfig:
                raise ValueError(
                    f"{self.quantization} quantization is currently not "
                    f"supported in ROCm.")
-            if (self.quantization
+            if self.quantization not in optimized_quantization_methods:
-                    not in ("fp8", "marlin", "gptq_marlin_24", "gptq_marlin",
-                            "awq_marlin", "fbgemm_fp8", "compressed_tensors")):
                logger.warning(
                    "%s quantization is not fully "
                    "optimized yet. The speed can be slower than "