OpenDAS / AutoAWQ · Commits

Commit 3b9f2875 (unverified)
Authored Jul 10, 2023 by Jiaming Tang; committed via GitHub on Jul 10, 2023

Merge pull request #33 from abhinavkulkarni/dev/more_models

Parents: ab536fb1, d2a10bd9

Changes: 3 changed files with 14 additions and 3 deletions
    awq/entry.py                 +2  -2
    awq/quantize/auto_scale.py   +6  -1
    awq/quantize/pre_quant.py    +6  -0
awq/entry.py

@@ -73,9 +73,9 @@ def build_model_and_enc(model_path):
     # all hf model
     config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
     if "mpt" in config.__class__.__name__.lower():
-        enc = AutoTokenizer.from_pretrained(config.tokenizer_name)
+        enc = AutoTokenizer.from_pretrained(config.tokenizer_name, trust_remote_code=True)
     else:
-        enc = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+        enc = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
     if args.load_quant:
         # directly load quantized weights
         print("Loading pre-computed quantized weights...")
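The entry.py change threads trust_remote_code=True through both AutoTokenizer.from_pretrained calls, which is what allows models whose tokenizer code ships inside the model repository (MPT being the case this PR targets) to load at all; without the flag, transformers refuses to execute repository-provided code and raises an error asking the caller to opt in. A minimal sketch of the same loading pattern in isolation; the model id "mosaicml/mpt-7b" is an illustrative assumption, not taken from the diff:

from transformers import AutoConfig, AutoTokenizer

model_path = "mosaicml/mpt-7b"  # hypothetical example id, not from the diff
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

if "mpt" in config.__class__.__name__.lower():
    # MPT configs name a separate tokenizer repo; fetching it can execute
    # custom code from that repo, so trust_remote_code must be set here too.
    enc = AutoTokenizer.from_pretrained(config.tokenizer_name, trust_remote_code=True)
else:
    enc = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)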
awq/quantize/auto_scale.py

@@ -107,11 +107,14 @@ def auto_scale_block(module, module_kwargs,
     def _search_module_scale(block, linears2scale: list, x, kwargs={}):
         # w: co, ci
         # x: n, ci
-        x = x.to(next(block.parameters()).device)
         weight = torch.cat([_m.weight for _m in linears2scale], dim=0)
         w_max = get_weight_scale(
             weight, q_group_size=q_config.get("q_group_size", -1))
+        # Clear GPU memory
+        del weight
+        torch.cuda.empty_cache()
+        x = x.to(next(block.parameters()).device)
         with torch.no_grad():
             org_out = block(x, **kwargs)
             if isinstance(org_out, tuple):

@@ -126,6 +129,8 @@ def auto_scale_block(module, module_kwargs,
         n_grid = 20
         history = []

+        # Clear GPU memory
+        torch.cuda.empty_cache()
         org_sd = {k: v.cpu() for k, v in block.state_dict().items()}
         for ratio in range(n_grid):
             ratio = ratio * 1 / n_grid
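The first hunk also moves the x.to(device) transfer to after the weight statistics are computed and frees the temporary concatenated weight before the forward passes that follow. The point of pairing del with torch.cuda.empty_cache() is that deleting the Python reference only returns the memory to PyTorch's caching allocator, while empty_cache() hands the cached blocks back to the driver. A self-contained demonstration of that distinction, assuming a CUDA device is present (the tensor sizes are made up for the example):

import torch

if torch.cuda.is_available():
    # A large temporary, analogous to the concatenated weight matrix above.
    weight = torch.randn(3 * 4096, 4096, device="cuda")
    w_max = weight.abs().amax(dim=1)  # stand-in for get_weight_scale

    print("reserved before del: ", torch.cuda.memory_reserved())
    del weight  # drop the last reference to the temporary
    # Unchanged: the freed blocks stay cached inside PyTorch's allocator.
    print("reserved after del:  ", torch.cuda.memory_reserved())
    torch.cuda.empty_cache()  # return cached blocks to the driver
    print("reserved after flush:", torch.cuda.memory_reserved())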
awq/quantize/pre_quant.py

@@ -135,6 +135,9 @@ def run_awq(
         # now solve for scaling and clipping
         input_feat = {k: torch.cat(v, dim=0) for k, v in input_feat.items()}

+        # Clear GPU memory
+        torch.cuda.empty_cache()
+
         if auto_scale:  # if it applies, we should also modify the input_feat with scales
             scales_list = auto_scale_block(layer, layer_kwargs,

@@ -146,6 +149,9 @@ def run_awq(
             # append prefix to make names global
             awq_results["scale"] += append_str_prefix(scales_list, get_op_name(model, layer) + ".")

+        # Clear GPU memory
+        torch.cuda.empty_cache()
+
         if mse_range:
             clip_list = auto_clip_block(layer,
                                         w_bit=w_bit, q_config=q_config,
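In pre_quant.py both new calls sit inside run_awq's per-layer loop: one after the calibration features are concatenated, one after scale search, so peak GPU usage stays near a single layer's working set instead of accumulating across layers. A hypothetical, self-contained loop showing the same placement; the Linear stack stands in for transformer blocks and is not run_awq's actual structure:

import torch
import torch.nn as nn

if torch.cuda.is_available():
    layers = [nn.Linear(1024, 1024).cuda() for _ in range(4)]  # stand-in blocks
    x = torch.randn(8, 1024, device="cuda")
    results = []

    with torch.no_grad():
        for layer in layers:
            out = layer(x)
            # Keep per-layer artifacts on CPU, mirroring how auto_scale_block
            # moves the state dict off-GPU, then flush between layers.
            results.append(out.cpu())
            del out
            torch.cuda.empty_cache()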