Unverified commit 57431d82, authored by Michael Goin, committed by GitHub
Browse files

[UX] Only show FP4 Marlin fallback warning for w4a4 models (#36806)


Co-authored-by: Claude <noreply@anthropic.com>
parent 3e64fe4a
......@@ -324,6 +324,12 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod):
)
delattr(layer, "w2_weight_packed")
logger.warning_once(
"Your GPU does not have native support for FP4 computation but "
"FP4 quantization is being used. Weight-only FP4 compression "
"will be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
prepare_moe_fp4_layer_for_marlin(layer)
self.moe_quant_config = self.get_fused_moe_quant_config(layer)
......
......@@ -147,13 +147,6 @@ def apply_fp4_marlin_linear(
def prepare_fp4_layer_for_marlin(
layer: torch.nn.Module, input_dtype: torch.dtype | None = None
) -> None:
logger.warning_once(
"Your GPU does not have native support for FP4 computation but "
"FP4 quantization is being used. Weight-only FP4 compression will "
"be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
is_nvfp4 = hasattr(layer, "weight_global_scale")
if input_dtype is not None and input_dtype.itemsize == 1:
if is_nvfp4:
......@@ -335,13 +328,6 @@ def prepare_nvfp4_moe_layer_for_marlin(
def prepare_moe_fp4_layer_for_marlin(
layer: torch.nn.Module, input_dtype: torch.dtype | None = None
) -> None:
logger.warning_once(
"Your GPU does not have native support for FP4 computation but "
"FP4 quantization is being used. Weight-only FP4 compression will "
"be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
is_nvfp4 = hasattr(layer, "w13_weight_scale_2")
if input_dtype is not None and input_dtype.itemsize == 1:
if is_nvfp4:
......
......@@ -141,6 +141,12 @@ def convert_to_nvfp4_linear_kernel_format(
layer.weights_padding_cols = 0
if backend == NvFp4LinearBackend.MARLIN:
logger.warning_once(
"Your GPU does not have native support for FP4 computation but "
"FP4 quantization is being used. Weight-only FP4 compression "
"will be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
prepare_fp4_layer_for_marlin(layer)
elif backend == NvFp4LinearBackend.FLASHINFER_TRTLLM:
weight, weight_scale = prepare_weights_for_nvfp4_flashinfer_trtllm(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment