Reduce memory usage for fp4 moe (#8413)

7df2c0c2 · fzyzcjy · GitHub · 69712e6f · 7df2c0c2
Unverified commit 7df2c0c2, authored Jul 29, 2025 by fzyzcjy; committed by GitHub on Jul 28, 2025.
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 2 additions and 0 deletions

python/sglang/srt/layers/quantization/modelopt_quant.py +2 -0

No files found.
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -900,6 +900,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
        layer.w13_blockscale_swizzled = Parameter(
            w13_blockscale_swizzled, requires_grad=False
        )
+        del layer.w13_weight_scale
        # This is for quantization, so we need to invert it.
        layer.w13_input_scale_quant = Parameter(
@@ -935,6 +936,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
        layer.w2_blockscale_swizzled = Parameter(
            w2_blockscale_swizzled, requires_grad=False
        )
+        del layer.w2_weight_scale
        layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
        device = layer.w13_weight.device