Reintroduce memory usage fix (#9535)

433266c1 · fzyzcjy · GitHub · fda47926 · 433266c1
Unverified commit 433266c1, authored Aug 25, 2025 by fzyzcjy; committed by GitHub on Aug 25, 2025.
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 0 deletions

python/sglang/srt/layers/quantization/modelopt_quant.py (+2 -0)

No files found.
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -1212,11 +1212,13 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):

            # Process w13 weights
            w13_blockscale_swizzled = self.swizzle_blockscale(layer.w13_weight_scale)
+            del layer.w13_weight_scale
            layer.w13_blockscale_swizzled.data.copy_(w13_blockscale_swizzled)
            layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)

            # Process w2 weights
            w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
+            del layer.w2_weight_scale
            layer.w2_blockscale_swizzled.data.copy_(w2_blockscale_swizzled)
            layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)