Unverified commit 67532a1a, authored by Michael Goin, committed by GitHub
Browse files

[UX] Remove "quantization is not fully optimized yet" log (#25012)


Signed-off-by: mgoin <mgoin64@gmail.com>
parent 5672ba90
......@@ -1086,22 +1086,6 @@ class ModelConfig:
def _verify_quantization(self) -> None:
supported_quantization = me_quant.QUANTIZATION_METHODS
optimized_quantization_methods = [
"fp8",
"modelopt",
"gptq_marlin_24",
"gptq_marlin",
"awq_marlin",
"fbgemm_fp8",
"compressed-tensors",
"experts_int8",
"quark",
"modelopt_fp4",
"bitblas",
"gptq_bitblas",
"inc",
"petit_nvfp4",
]
if self.quantization is not None:
self.quantization = cast(me_quant.QuantizationMethods,
self.quantization)
......@@ -1183,11 +1167,6 @@ class ModelConfig:
f"be one of {supported_quantization}.")
from vllm.platforms import current_platform
current_platform.verify_quantization(self.quantization)
if self.quantization not in optimized_quantization_methods:
logger.warning(
"%s quantization is not fully "
"optimized yet. The speed can be slower than "
"non-quantized models.", self.quantization)
def _verify_cuda_graph(self) -> None:
# The `max_seq_len_to_capture` was incorrectly
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment