Unverified commit 980de31c, authored by Reinforce-II and committed by GitHub
Browse files

[bugfix] remove unused parameters to reduce unnecessary vram usage (#26789)


Signed-off-by: Reinforce-II <fate@eastal.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
parent 1c160841
...@@ -307,10 +307,12 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): ...@@ -307,10 +307,12 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
layer.w13_weight = torch.nn.Parameter( layer.w13_weight = torch.nn.Parameter(
layer.w13_weight_packed.data, requires_grad=False layer.w13_weight_packed.data, requires_grad=False
) )
delattr(layer, "w13_weight_packed")
layer.w2_weight = torch.nn.Parameter( layer.w2_weight = torch.nn.Parameter(
layer.w2_weight_packed.data, requires_grad=False layer.w2_weight_packed.data, requires_grad=False
) )
delattr(layer, "w2_weight_packed")
# reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel. # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
if self.allow_flashinfer: if self.allow_flashinfer:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment