[ModelOpt] Remove NVFP4 MoE K%16==0 constraint (#26891)

Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>

[ModelOpt] Remove NVFP4 MoE K%16==0 constraint (#26891)
Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>
d7963752 · XiaobingZhang · GitHub · 14f84563 · d7963752
Unverified Commit d7963752 authored Oct 16, 2025 by XiaobingZhang Committed by GitHub Oct 15, 2025
Show whitespace changes
Inline Side-by-side

Showing with 0 additions and 12 deletions

vllm/model_executor/layers/quantization/modelopt.py vllm/model_executor/layers/quantization/modelopt.py +0 -12

No files found.
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1542,23 +1542,11 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
            del layer.w2_input_scale_quant
        else:
            # Non-TRT-LLM processing (Cutlass or non-flashinfer)
-            assert layer.w13_weight_scale.shape[2] % 16 == 0, (
-                "Expected weight_scale.dim(1) to be divisible by 16"
-            )
-            assert layer.w13_weight_scale.dtype == torch.float8_e4m3fn, (
-                "Weight Blockscale must be represented as FP8-E4M3"
-            )
            w13_blockscale_swizzled = swizzle_blockscale(layer.w13_weight_scale)
            layer.w13_weight_scale = Parameter(
                w13_blockscale_swizzled, requires_grad=False
            )

-            assert layer.w2_weight_scale.shape[2] % 16 == 0, (
-                "Expected weight_scale.dim(1) to be divisible by 16"
-            )
-            assert layer.w2_weight_scale.dtype == torch.float8_e4m3fn, (
-                "Weight Blockscale must be represented as FP8-E4M3"
-            )
            w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale)
            layer.w2_weight_scale = Parameter(
                w2_blockscale_swizzled, requires_grad=False