[Bug] Fix Layer `weight_block_size` Assertion Issue (#24674)

Signed-off-by: yewentao256 <zhyanwentao@126.com>

[Bug] Fix Layer `weight_block_size` Assertion Issue (#24674)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
fcba05c4 · Wentao Ye · GitHub · 7a30fa87 · fcba05c4
Unverified commit fcba05c4, authored Sep 11, 2025 by Wentao Ye; committed by GitHub on Sep 11, 2025
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 3 additions and 3 deletions

vllm/model_executor/layers/quantization/fp8.py vllm/model_executor/layers/quantization/fp8.py +3 -3

No files found.
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -450,10 +450,10 @@ class Fp8LinearMethod(LinearMethodBase):
            # Activations not quantized for marlin.
            del layer.input_scale

-        # On B200, if E8M0 for DeepGemm is used, we need to
+        # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to
        # requantize the weight and input to the specific scale
        # at the same time.
-        if is_deep_gemm_e8m0_used():
+        if is_deep_gemm_e8m0_used() and self.block_quant:
            assert layer.weight_block_size is not None
            block_sz = tuple(layer.weight_block_size)
            requant_weight_ue8m0_inplace(
@@ -905,7 +905,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            del layer.w13_input_scale
            del layer.w2_input_scale

-        if is_deep_gemm_e8m0_used():
+        if is_deep_gemm_e8m0_used() and self.block_quant:
            assert layer.weight_block_size is not None
            # Re-quantise the expert weights so their scales are UE8M0.
            block_sz = tuple(layer.weight_block_size)