Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
57431d82
Unverified
Commit
57431d82
authored
Mar 12, 2026
by
Michael Goin
Committed by
GitHub
Mar 12, 2026
Browse files
[UX] Only show FP4 Marlin fallback warning for w4a4 models (#36806)
Co-authored-by:
Claude
<
noreply@anthropic.com
>
parent
3e64fe4a
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
12 additions
and
14 deletions
+12
-14
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
...quantization/compressed_tensors/compressed_tensors_moe.py
+6
-0
vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
...el_executor/layers/quantization/utils/marlin_utils_fp4.py
+0
-14
vllm/model_executor/layers/quantization/utils/nvfp4_utils.py
vllm/model_executor/layers/quantization/utils/nvfp4_utils.py
+6
-0
No files found.
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
View file @
57431d82
...
@@ -324,6 +324,12 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod):
...
@@ -324,6 +324,12 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod):
)
)
delattr
(
layer
,
"w2_weight_packed"
)
delattr
(
layer
,
"w2_weight_packed"
)
logger
.
warning_once
(
"Your GPU does not have native support for FP4 computation but "
"FP4 quantization is being used. Weight-only FP4 compression "
"will be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
prepare_moe_fp4_layer_for_marlin
(
layer
)
prepare_moe_fp4_layer_for_marlin
(
layer
)
self
.
moe_quant_config
=
self
.
get_fused_moe_quant_config
(
layer
)
self
.
moe_quant_config
=
self
.
get_fused_moe_quant_config
(
layer
)
...
...
vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
View file @
57431d82
...
@@ -147,13 +147,6 @@ def apply_fp4_marlin_linear(
...
@@ -147,13 +147,6 @@ def apply_fp4_marlin_linear(
def
prepare_fp4_layer_for_marlin
(
def
prepare_fp4_layer_for_marlin
(
layer
:
torch
.
nn
.
Module
,
input_dtype
:
torch
.
dtype
|
None
=
None
layer
:
torch
.
nn
.
Module
,
input_dtype
:
torch
.
dtype
|
None
=
None
)
->
None
:
)
->
None
:
logger
.
warning_once
(
"Your GPU does not have native support for FP4 computation but "
"FP4 quantization is being used. Weight-only FP4 compression will "
"be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
is_nvfp4
=
hasattr
(
layer
,
"weight_global_scale"
)
is_nvfp4
=
hasattr
(
layer
,
"weight_global_scale"
)
if
input_dtype
is
not
None
and
input_dtype
.
itemsize
==
1
:
if
input_dtype
is
not
None
and
input_dtype
.
itemsize
==
1
:
if
is_nvfp4
:
if
is_nvfp4
:
...
@@ -335,13 +328,6 @@ def prepare_nvfp4_moe_layer_for_marlin(
...
@@ -335,13 +328,6 @@ def prepare_nvfp4_moe_layer_for_marlin(
def
prepare_moe_fp4_layer_for_marlin
(
def
prepare_moe_fp4_layer_for_marlin
(
layer
:
torch
.
nn
.
Module
,
input_dtype
:
torch
.
dtype
|
None
=
None
layer
:
torch
.
nn
.
Module
,
input_dtype
:
torch
.
dtype
|
None
=
None
)
->
None
:
)
->
None
:
logger
.
warning_once
(
"Your GPU does not have native support for FP4 computation but "
"FP4 quantization is being used. Weight-only FP4 compression will "
"be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
is_nvfp4
=
hasattr
(
layer
,
"w13_weight_scale_2"
)
is_nvfp4
=
hasattr
(
layer
,
"w13_weight_scale_2"
)
if
input_dtype
is
not
None
and
input_dtype
.
itemsize
==
1
:
if
input_dtype
is
not
None
and
input_dtype
.
itemsize
==
1
:
if
is_nvfp4
:
if
is_nvfp4
:
...
...
vllm/model_executor/layers/quantization/utils/nvfp4_utils.py
View file @
57431d82
...
@@ -141,6 +141,12 @@ def convert_to_nvfp4_linear_kernel_format(
...
@@ -141,6 +141,12 @@ def convert_to_nvfp4_linear_kernel_format(
layer
.
weights_padding_cols
=
0
layer
.
weights_padding_cols
=
0
if
backend
==
NvFp4LinearBackend
.
MARLIN
:
if
backend
==
NvFp4LinearBackend
.
MARLIN
:
logger
.
warning_once
(
"Your GPU does not have native support for FP4 computation but "
"FP4 quantization is being used. Weight-only FP4 compression "
"will be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
prepare_fp4_layer_for_marlin
(
layer
)
prepare_fp4_layer_for_marlin
(
layer
)
elif
backend
==
NvFp4LinearBackend
.
FLASHINFER_TRTLLM
:
elif
backend
==
NvFp4LinearBackend
.
FLASHINFER_TRTLLM
:
weight
,
weight_scale
=
prepare_weights_for_nvfp4_flashinfer_trtllm
(
weight
,
weight_scale
=
prepare_weights_for_nvfp4_flashinfer_trtllm
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment