Unverified commit daa05bf3, authored by EdalatiAli, committed by GitHub
Browse files

[Bugfix] Fix AttributeError when serving MXFP8 models with DeepGEMM installed (#37358)


Signed-off-by: EdalatiAli <aliedalati@cohere.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent 7769b583
...@@ -19,6 +19,7 @@ from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( ...@@ -19,6 +19,7 @@ from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
) )
from vllm.model_executor.layers.linear import LinearBase from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
from vllm.model_executor.layers.quantization.mxfp8 import Mxfp8OnlineLinearMethod
from vllm.tracing import instrument from vllm.tracing import instrument
from vllm.utils.deep_gemm import ( from vllm.utils.deep_gemm import (
fp8_gemm_nt, fp8_gemm_nt,
...@@ -136,8 +137,9 @@ def _fp8_linear_may_use_deep_gemm(module: torch.nn.Module) -> bool: ...@@ -136,8 +137,9 @@ def _fp8_linear_may_use_deep_gemm(module: torch.nn.Module) -> bool:
if not ( if not (
isinstance(module, LinearBase) isinstance(module, LinearBase)
and isinstance(module.quant_method, Fp8LinearMethod) and isinstance(module.quant_method, Fp8LinearMethod)
and module.quant_method.block_quant and not isinstance(module.quant_method, Mxfp8OnlineLinearMethod)
and not module.quant_method.use_marlin and getattr(module.quant_method, "block_quant", False)
and not getattr(module.quant_method, "use_marlin", True)
): ):
return False return False
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment