Commit d32095ab authored by Abhinav Kulkarni
Browse files

[Major] Add CPU offloading support for apply_scale, apply_clip,...

[Major] Add CPU offloading support for apply_scale, apply_clip, pseudo_quantize_model_weight, real_quantize_model_weight
parent 95cd9c2d
@@ -75,9 +75,11 @@ def auto_clip_block(module,
         # due to qk bmm, it is hard to clip precisely
         if any([_ in name for _ in ["q_", "k_", "query", "key", "Wqkv"]]):
             continue
+        named_linears[name].cuda()
         max_val = auto_clip_layer(
             named_linears[name].weight, input_feat[name], n_bit=w_bit, q_config=q_config)
         clip_list.append((name, max_val))
+        named_linears[name].cpu()
     return clip_list
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment