Unverified commit 7df2c0c2, authored by fzyzcjy, committed by GitHub
Browse files

Reduce memory usage for fp4 moe (#8413)

parent 69712e6f
...@@ -900,6 +900,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase): ...@@ -900,6 +900,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
layer.w13_blockscale_swizzled = Parameter( layer.w13_blockscale_swizzled = Parameter(
w13_blockscale_swizzled, requires_grad=False w13_blockscale_swizzled, requires_grad=False
) )
del layer.w13_weight_scale
# This is for quantization, so we need to invert it. # This is for quantization, so we need to invert it.
layer.w13_input_scale_quant = Parameter( layer.w13_input_scale_quant = Parameter(
...@@ -935,6 +936,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase): ...@@ -935,6 +936,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
layer.w2_blockscale_swizzled = Parameter( layer.w2_blockscale_swizzled = Parameter(
w2_blockscale_swizzled, requires_grad=False w2_blockscale_swizzled, requires_grad=False
) )
del layer.w2_weight_scale
layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False) layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
device = layer.w13_weight.device device = layer.w13_weight.device
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment