Unverified commit bcaa8a36 authored by Casper, committed by GitHub

v0.2.0 (#330)


Co-authored-by: jinz2014 <7799920+jinz2014@users.noreply.github.com>
Co-authored-by: Jin Z <5zj@cousteau.ftpn.ornl.gov>
parent c69d3b65
import torch.nn as nn


def get_named_linears(module):
    return {name: m for name, m in module.named_modules() if isinstance(m, nn.Linear)}


def get_op_by_name(module, op_name):
    # get the op by its name relative to the module
    for name, m in module.named_modules():
@@ -12,10 +14,10 @@ def get_op_by_name(module, op_name):
def set_op_by_name(layer, name, new_module):
    levels = name.split(".")
    if len(levels) > 1:
        mod_ = layer
        for l_idx in range(len(levels) - 1):
            if levels[l_idx].isdigit():
                mod_ = mod_[int(levels[l_idx])]
            else:
@@ -43,6 +45,7 @@ def append_str_prefix(x, prefix):
    else:
        return x


def exclude_layers_to_not_quantize(linear_layers, modules_to_not_convert):
    if modules_to_not_convert is None:
        return linear_layers
@@ -51,4 +54,4 @@ def exclude_layers_to_not_quantize(linear_layers, modules_to_not_convert):
    for name, linear_layer in linear_layers.items():
        if not any(key in name for key in modules_to_not_convert):
            filtered_layers[name] = linear_layer
    return filtered_layers
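For context, a minimal sketch of how these helpers fit together when preparing a model for quantization: collect the `nn.Linear` submodules, drop the ones named in `modules_to_not_convert`, and swap a replacement in by its dotted name. The `TinyBlock` module and the bias-free `nn.Linear` stand-in below are illustrative only and not part of the repository.

```python
import torch.nn as nn

# Toy module, used only to exercise the helpers shown in the diff above.
class TinyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(2)])
        self.lm_head = nn.Linear(8, 4)

def get_named_linears(module):
    # Map dotted submodule name -> nn.Linear, as in the helper above.
    return {n: m for n, m in module.named_modules() if isinstance(m, nn.Linear)}

def exclude_layers_to_not_quantize(linear_layers, modules_to_not_convert):
    if modules_to_not_convert is None:
        return linear_layers
    return {n: m for n, m in linear_layers.items()
            if not any(key in n for key in modules_to_not_convert)}

def set_op_by_name(layer, name, new_module):
    # Walk the dotted path, indexing into ModuleList entries by digit.
    levels = name.split(".")
    mod_ = layer
    for lvl in levels[:-1]:
        mod_ = mod_[int(lvl)] if lvl.isdigit() else getattr(mod_, lvl)
    setattr(mod_, levels[-1], new_module)

block = TinyBlock()
linears = exclude_layers_to_not_quantize(get_named_linears(block), ["lm_head"])
for name in linears:
    # Stand-in for swapping in a quantized replacement layer.
    set_op_by_name(block, name, nn.Linear(8, 8, bias=False))
print(sorted(linears))  # ['layers.0', 'layers.1'] -- lm_head is left untouched
```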
@@ -79,6 +79,7 @@ def unpack_reorder_pack(qweight, qzeros, bits):
    return qweight, qzeros


def dequantize_gemm(qweight, qzeros, scales, bits, group_size):
    # Unpack the qweight and qzeros tensors
    iweight, izeros = unpack_awq(qweight, qzeros, bits)
@@ -94,4 +95,4 @@ def dequantize_gemm(qweight, qzeros, scales, bits, group_size):
    izeros = izeros.repeat_interleave(group_size, dim=0)
    iweight = (iweight - izeros) * scales
    return iweight
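A small, self-contained sketch of the group-wise dequantization step at the end of `dequantize_gemm`: each group of `group_size` rows shares one zero point and one scale, so `repeat_interleave(group_size, dim=0)` expands them to per-row values before `(iweight - izeros) * scales` is applied. The tensor shapes and dtypes below are assumed for illustration, not taken from the repository.

```python
import torch

group_size = 4                       # rows per quantization group (illustrative)
in_features, out_features = 8, 3

# Unpacked integer weights plus per-group integer zeros / fp16 scales,
# roughly what unpack_awq would hand back (shapes assumed for this sketch).
iweight = torch.randint(0, 16, (in_features, out_features))
izeros = torch.randint(0, 16, (in_features // group_size, out_features))
scales = torch.rand(in_features // group_size, out_features, dtype=torch.float16)

# Expand the per-group statistics to one entry per row, then dequantize.
izeros_full = izeros.repeat_interleave(group_size, dim=0)   # (8, 3)
scales_full = scales.repeat_interleave(group_size, dim=0)   # (8, 3)
fweight = (iweight - izeros_full) * scales_full
print(fweight.shape, fweight.dtype)  # torch.Size([8, 3]) torch.float16
```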
@@ -23,6 +23,7 @@ def auto_parallel(args):
    else:
        cuda_visible_devices = list(range(8))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
        [str(dev) for dev in cuda_visible_devices[:n_gpu]]
    )
    logging.debug("CUDA_VISIBLE_DEVICES: ", os.environ["CUDA_VISIBLE_DEVICES"])
    return cuda_visible_devices
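The visible-device string built here is just a comma-separated list of GPU indices truncated to the requested count. A minimal sketch of the same construction outside the `auto_parallel` helper, with an illustrative `n_gpu` value:

```python
import os

n_gpu = 2                              # illustrative value
cuda_visible_devices = list(range(8))  # fallback when no device list is provided
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
    [str(dev) for dev in cuda_visible_devices[:n_gpu]]
)
print(os.environ["CUDA_VISIBLE_DEVICES"])  # "0,1"
```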
@@ -115,7 +115,7 @@ def dequantize(imatrix, scales, zeros, group_size):
    ) * scales.repeat_interleave(group_size, dim=0)
    fmatrix = fmatrix.to(torch.float16)
    return fmatrix
# Auto and Base model classes in AutoAWQ
View the documentation for the main AutoAWQ model classes below.
::: awq.models.auto.AutoAWQForCausalLM
::: awq.models.base.BaseAWQForCausalLM
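For orientation, a short usage sketch of these classes, assuming the quantize/load workflow from the project's README and examples; the model path, output directory, and `quant_config` values below are placeholders, not prescribed by this page.

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "facebook/opt-125m"   # placeholder model
quant_path = "opt-125m-awq"        # placeholder output directory
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Quantize a float checkpoint and save the AWQ weights...
model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

# ...then load the quantized checkpoint for inference.
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
```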