Commit f8070792 authored by yangql

Deleted tests/__init__.py, tests/bench_autoawq_autogptq.py, tests/pytest.ini, tests/test_awq_compatibility_generation.py, tests/test_hpu_linear.py, tests/test_peft_conversion.py, tests/test_q4.py, tests/test_quantization.py, tests/test_repacking.py, tests/test_serialization.py, tests/test_sharded_loading.py, tests/test_triton.py files
parent a2630e0f
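
# ---- tests/__init__.py (deleted in this commit; no diff content shown, presumably empty) ----

# ---- tests/bench_autoawq_autogptq.py (deleted in this commit) ----
# Standalone benchmark comparing AWQ's GEMM/GEMV kernels against the GPTQ ExLlamaV2 kernel.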
import torch
try:
    from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV
except ModuleNotFoundError as e:
    raise ModuleNotFoundError(
        f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this benchmark. {e}"
    )
import numpy as np
from auto_gptq.modeling._utils import autogptq_post_init
from auto_gptq.nn_modules.qlinear.qlinear_exllamav2 import QuantLinear
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
group_size = 128
bits = 4
# Yi 34B down_proj
k = 20480
n = 7168
device = torch.device("cuda:0")
linear_class = dynamically_import_QuantLinear(use_triton=False, desc_act=False, group_size=group_size, bits=4)
linear_gptq = linear_class(
    bits=bits,
    group_size=group_size,
    infeatures=k,
    outfeatures=n,
    bias=False,
)
assert isinstance(linear_gptq, QuantLinear)
linear_gptq = linear_gptq.eval()
linear_gptq = linear_gptq.to(device)
linear_gptq = autogptq_post_init(linear_gptq, use_act_order=False)
num_runs = 60
lines = []
seqlens = [
    1,
    2,
    3,
    4,
    5,
    6,
    7,
    8,
    12,
    16,
    24,
    32,
    48,
    64,
    80,
    120,
    250,
    512,
    1024,
    2048,
    4000,
    8000,
]
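
# For each sequence length below, the loop measures the latency of a single quantized
# matmul with CUDA events (after one warmup call): first for the GPTQ ExLlamaV2 kernel,
# then for AWQ. AWQ's GEMV kernel is used when the total sequence length is <= 8 (the
# decoding regime) and its GEMM kernel otherwise; mean, p10 and p90 latencies over
# num_runs iterations are collected for both.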
print(f"in_features={k}, out_features={n}")
for query_length in seqlens:
    # batch_size, query_length, hidden_size
    inp = torch.rand(1, query_length, k, dtype=torch.float16).to(device)

    torch.cuda.empty_cache()

    # Warmup Exllama v2
    with torch.no_grad():
        res = linear_gptq(inp)

    latencies = []
    torch.cuda.synchronize()
    for _ in range(num_runs):
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        torch.cuda.synchronize()

        start_event.record()
        res = linear_gptq(inp)
        end_event.record()
        torch.cuda.synchronize()

        latency_ms = start_event.elapsed_time(end_event)
        latencies.append(latency_ms)

    # print("-------")
    # print(f"Latency GPTQ Exllama v2 (query_length={query_length}): {np.mean(latencies):.3f} ms, p10={np.percentile(latencies, 10):.3f}, p90={np.percentile(latencies, 90):.3f}")

    exllamav2_mean_latency = np.mean(latencies)
    exllamav2_p10 = np.percentile(latencies, 10)
    exllamav2_p90 = np.percentile(latencies, 90)

    torch.cuda.empty_cache()

    total_seqlen = inp.shape[:-1].numel()
    if total_seqlen <= 8:
        awq_kernel = "GEMV"
        linear_awq = WQLinear_GEMV(
            w_bit=bits,
            group_size=group_size,
            in_features=k,
            out_features=n,
            bias=False,
            dev=device,
        )
    else:
        awq_kernel = "GEMM"
        linear_awq = WQLinear_GEMM(
            w_bit=bits,
            group_size=group_size,
            in_features=k,
            out_features=n,
            bias=False,
            dev=device,
        )

    # Warmup AWQ
    with torch.no_grad():
        res = linear_awq(inp)

    latencies = []
    torch.cuda.synchronize()
    for _ in range(num_runs):
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        torch.cuda.synchronize()

        start_event.record()
        res = linear_awq(inp)
        end_event.record()
        torch.cuda.synchronize()

        latency_ms = start_event.elapsed_time(end_event)
        latencies.append(latency_ms)

    awq_mean_latency = np.mean(latencies)
    awq_p10 = np.percentile(latencies, 10)
    awq_p90 = np.percentile(latencies, 90)

    exllama_speedup = awq_mean_latency / exllamav2_mean_latency

    # print(f"Latency AWQ (query_length={query_length}, kernel={awq_kernel}): {np.mean(latencies):.3f} ms, p10={np.percentile(latencies, 10):.3f}, p90={np.percentile(latencies, 90):.3f}")

    line = "{},{},{},{},{},{},{},{},{},{},{}".format(
        bits,
        group_size,
        total_seqlen,
        awq_kernel,
        f"{awq_mean_latency:.3f}",
        f"{exllamav2_mean_latency:.3f}",
        f"{awq_p10:.3f}",
        f"{awq_p90:.3f}",
        f"{exllamav2_p10:.3f}",
        f"{exllamav2_p90:.3f}",
        f"{exllama_speedup:.3f}",
    )
    lines.append(line)
header = "bits, group_size, total_seqlen, awq_kernel, awq_mean_latency (ms), exllamav2_mean_latency (ms), awq_p10, awq_p90, exllamav2_p10, exllamav2_p90, exllama_speedup"
print(header)
for line in lines:
    print(line)
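
# The script prints a CSV header followed by one row per sequence length; stdout can be
# redirected to a file if the results are to be plotted or tabulated.

# ---- tests/pytest.ini (deleted in this commit) ----
# Pytest configuration for the test suite.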
[pytest]
addopts=-s -v
log_cli=true
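# -s disables output capture, -v prints one line per test, and log_cli=true streams log
# records to the terminal while the tests run.

# ---- tests/test_awq_compatibility_generation.py (deleted in this commit) ----
# Checks that AutoGPTQ kernel paths reproduce AutoAWQ generations on the same AWQ checkpoint.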
# ruff: noqa: I001
import unittest
import torch
import autogptq_cuda_64
import autogptq_cuda_256
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear as CudaOldQLinear
try:
    from awq import AutoAWQForCausalLM
except ModuleNotFoundError as e:
    AutoAWQForCausalLM = None
    AWQ_EXCEPTION = e


class TestAwqCompatibility(unittest.TestCase):
    # TODO: test cuda-old fp16.
    # TODO: test cuda-old fp32.
    # TODO: test exllama v2.
    def test_generation_cuda_old_fp32_pytorch(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"

        model_autogptq = AutoGPTQForCausalLM.from_quantized(
            quant_path,
            device=device,
            use_triton=False,
            inject_fused_attention=False,
            inject_fused_mlp=False,
            disable_exllama=True,
            disable_exllamav2=True,
            torch_dtype=torch.float32,
        )

        tokenizer = AutoTokenizer.from_pretrained(quant_path)
        prompt = "I am in Paris and I am going to see the"
        inp = tokenizer(prompt, return_tensors="pt").to(device)

        for name, submodule in model_autogptq.named_modules():
            if isinstance(submodule, CudaOldQLinear):
                # Just a hack to test the handmade pytorch implementation path.
                submodule.autogptq_cuda_available = False

        autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
        autogptq_output = tokenizer.decode(autogptq_output[0])

        model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
        awq_output = model_awq.generate(
            **inp,
            num_beams=1,
            min_new_tokens=30,
            max_new_tokens=30,
        )
        awq_output = tokenizer.decode(awq_output[0])

        self.assertTrue(awq_output == autogptq_output)

    def test_generation_cuda_old_cuda_256(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"
        tokenizer = AutoTokenizer.from_pretrained(quant_path)
        prompt = "I am in Paris and I am going to see the"

        for torch_dtype in [torch.float16, torch.float32]:
            model_autogptq = AutoGPTQForCausalLM.from_quantized(
                quant_path,
                device=device,
                use_triton=False,
                inject_fused_attention=False,
                inject_fused_mlp=False,
                disable_exllama=True,
                disable_exllamav2=True,
                torch_dtype=torch_dtype,
            )

            for name, module in model_autogptq.named_modules():
                if isinstance(module, CudaOldQLinear):
                    self.assertTrue(module.autogptq_cuda == autogptq_cuda_256)
                    if torch_dtype == torch.float32:
                        self.assertFalse(module.use_cuda_fp16)
                    else:
                        self.assertTrue(module.use_cuda_fp16)

            inp = tokenizer(prompt, return_tensors="pt").to(device)

            autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
            autogptq_output = tokenizer.decode(autogptq_output[0])

            model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
            awq_output = model_awq.generate(
                **inp,
                num_beams=1,
                min_new_tokens=30,
                max_new_tokens=30,
            )
            awq_output = tokenizer.decode(awq_output[0])

            self.assertTrue(awq_output == autogptq_output)

    def test_generation_cuda_old_cuda_64(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"
        tokenizer = AutoTokenizer.from_pretrained(quant_path)
        prompt = "I am in Paris and I am going to see the"

        for torch_dtype in [torch.float16, torch.float32]:
            model_autogptq = AutoGPTQForCausalLM.from_quantized(
                quant_path,
                device=device,
                use_triton=False,
                inject_fused_attention=False,
                inject_fused_mlp=False,
                disable_exllama=True,
                disable_exllamav2=True,
                torch_dtype=torch_dtype,
            )

            # Force autogptq_cuda_64.
            for name, module in model_autogptq.named_modules():
                if isinstance(module, CudaOldQLinear):
                    if module.autogptq_cuda != autogptq_cuda_64:
                        module.autogptq_cuda = autogptq_cuda_64
                    if torch_dtype == torch.float32:
                        self.assertFalse(module.use_cuda_fp16)
                    else:
                        self.assertTrue(module.use_cuda_fp16)

            inp = tokenizer(prompt, return_tensors="pt").to(device)

            autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
            autogptq_output = tokenizer.decode(autogptq_output[0])

            model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
            awq_output = model_awq.generate(
                **inp,
                num_beams=1,
                min_new_tokens=30,
                max_new_tokens=30,
            )
            awq_output = tokenizer.decode(awq_output[0])

            self.assertTrue(awq_output == autogptq_output)

    def test_generation_exllama(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"

        model_autogptq = AutoGPTQForCausalLM.from_quantized(
            quant_path,
            device=device,
            use_triton=False,
            inject_fused_attention=False,
            inject_fused_mlp=False,
            disable_exllama=False,
            disable_exllamav2=True,
            torch_dtype=torch.float16,
        )

        tokenizer = AutoTokenizer.from_pretrained(quant_path)
        prompt = "I am in Paris and I am going to see the"
        inp = tokenizer(prompt, return_tensors="pt").to(device)

        for name, submodule in model_autogptq.named_modules():
            if isinstance(submodule, CudaOldQLinear):
                # Just a hack to test the handmade pytorch implementation path.
                submodule.autogptq_cuda_available = False

        autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
        autogptq_output = tokenizer.decode(autogptq_output[0])

        model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
        awq_output = model_awq.generate(
            **inp,
            num_beams=1,
            min_new_tokens=30,
            max_new_tokens=30,
        )
        awq_output = tokenizer.decode(awq_output[0])

        self.assertTrue(awq_output == autogptq_output)

    def test_generation_exllamav2(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"

        model_autogptq = AutoGPTQForCausalLM.from_quantized(
            quant_path,
            device=device,
            use_triton=False,
            inject_fused_attention=False,
            inject_fused_mlp=False,
            torch_dtype=torch.float16,
        )

        tokenizer = AutoTokenizer.from_pretrained(quant_path)
        prompt = "I am in Paris and I am going to see the"
        inp = tokenizer(prompt, return_tensors="pt").to(device)

        for name, submodule in model_autogptq.named_modules():
            if isinstance(submodule, CudaOldQLinear):
                # Just a hack to test the handmade pytorch implementation path.
                submodule.autogptq_cuda_available = False

        autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
        autogptq_output = tokenizer.decode(autogptq_output[0])

        model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
        awq_output = model_awq.generate(
            **inp,
            num_beams=1,
            min_new_tokens=30,
            max_new_tokens=30,
        )
        awq_output = tokenizer.decode(awq_output[0])

        self.assertTrue(awq_output == autogptq_output)
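
# ---- tests/test_hpu_linear.py (deleted in this commit) ----
# Compares the HPU QuantLinear against the cuda-old QuantLinear reference.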
import numpy as np
import math
import torch
import pytest
try:
    import habana_frameworks.torch.core as htcore
except Exception as e:
    pytestmark = pytest.mark.skip("Couldn't import HPU plugin, skipping HPU tests")


def _convert_to_tensor_list(tensor_or_tensors):
    if isinstance(tensor_or_tensors, tuple):
        return list(tensor_or_tensors)
    elif isinstance(tensor_or_tensors, list):
        return tensor_or_tensors
    elif isinstance(tensor_or_tensors, torch.Tensor):
        # You can't return list(tensor_or_tensors), because it will fail on 0-d tensors
        result_list = []
        result_list.append(tensor_or_tensors)
        return result_list
    else:
        raise TypeError("Can not convert outputs")


def compare_tensors(hpu_tensors, cpu_tensors, atol, rtol, assert_enable=True):
    hpu_tensors = _convert_to_tensor_list(hpu_tensors)
    cpu_tensors = _convert_to_tensor_list(cpu_tensors)
    assert len(hpu_tensors) == len(cpu_tensors)

    hpu_tensors = [tensor.to('cpu') if tensor is not None else tensor for tensor in hpu_tensors]

    for i in range(len(hpu_tensors)):
        if cpu_tensors[i] is None and hpu_tensors[i] is None:
            continue
        hpu_tensors[i] = (
            hpu_tensors[i].float()
            if hpu_tensors[i].dtype in [torch.bfloat16, torch.float8_e5m2, torch.float8_e4m3fn]
            else hpu_tensors[i]
        )
        cpu_tensors[i] = (
            cpu_tensors[i].float()
            if cpu_tensors[i].dtype in [torch.bfloat16, torch.float8_e5m2, torch.float8_e4m3fn]
            else cpu_tensors[i]
        )
        if assert_enable:
            np.testing.assert_allclose(
                hpu_tensors[i].detach().numpy(),
                cpu_tensors[i].detach().numpy(),
                atol=atol,
                rtol=rtol,
            )
        else:
            print("hpu_result[{}]".format(i), hpu_tensors[i].detach().numpy())
            print("cpu_result[{}]".format(i), cpu_tensors[i].detach().numpy())
            return np.allclose(
                hpu_tensors[i].detach().numpy(),
                cpu_tensors[i].detach().numpy(),
                atol=atol,
                rtol=rtol,
                equal_nan=True,
            )
# taken from AutoGPTQ/tests/test_repacking.py
def gen_quant4(k, n, groupsize=-1, bias=False):
    maxq = 2 ** 4 - 1
    w = torch.randn((k, n), dtype=torch.bfloat16, device="cpu")
    original_w = w.clone()
    if groupsize != -1:
        w = w.reshape((-1, groupsize, n))
        w = w.permute(1, 0, 2)
        w = w.reshape((groupsize, -1))
    s = torch.max(torch.abs(w), 0, keepdim=True)[0]
    s *= 2 / maxq
    # Quantize.
    w = torch.round(w / s).int()
    # Unsigned storage.
    w += (maxq + 1) // 2
    w = torch.clamp(w, 0, maxq)
    # Dequantize.
    ref = (w - (maxq + 1) // 2).bfloat16() * s
    if groupsize != -1:
        def reshape(w):
            w = w.reshape((groupsize, -1, n))
            w = w.permute(1, 0, 2)
            w = w.reshape((k, n)).contiguous()
            return w
        ref = reshape(ref)
        w = reshape(w)
    s = s.reshape((-1, n)).contiguous()
    linear = torch.nn.Linear(k, n, bias=bias)
    linear.weight.data = ref.t()
    return original_w, linear, s
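
# A worked example of the scheme above: maxq = 2**4 - 1 = 15, the scale per group (or per
# column when groupsize == -1) is s = 2 * max|w| / 15, weights are stored unsigned as
# q = clamp(round(w / s) + 8, 0, 15), and dequantized back as (q - 8) * s. The returned
# nn.Linear holds the dequantized reference weights that the packed QuantLinear outputs
# are compared against.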
@pytest.mark.parametrize("bits", [4])
@pytest.mark.parametrize("group_size", [16, 32, 128])
@pytest.mark.parametrize("infeatures", [64, 128, 512, 4096, 11008])
@pytest.mark.parametrize("outfeatures", [64, 128, 512, 4096, 11008])
@pytest.mark.parametrize("bias", [True, False], ids=["bias", "no_bias"])
@pytest.mark.parametrize("scales_value, weight_value, zeros_value", [("normal", "normal", "normal"), ("normal", "normal", "range"), ("normal", "normal", "zeros"), ("ones", "zeros", "zeros"), ("ones", "zeros", "eights"), ("ones", "range", "zeros"), ("ones", "range", "ones"), ("ones", "7", "ones"), ("ones", "zeros", "range"),("ones", "zeros", "ones"), ("ones", "range", "range"), ("range", "range", "range"), ("range", "range", "zeros")])
@pytest.mark.parametrize("weight_dtype", [torch.bfloat16, torch.float32], ids=["bf16", "fp32"])
def test_qlinear_hpu(bits, group_size, infeatures, outfeatures, bias, scales_value, weight_value, zeros_value, weight_dtype):
    qweight_shape_0 = infeatures // 32 * bits
    qzeros_shape_0 = math.ceil(infeatures / group_size)
    qzeros_shape_1 = outfeatures // 32 * bits
    if qweight_shape_0 == 0 or qzeros_shape_0 == 0 or qzeros_shape_1 == 0:
        pytest.skip(f"{qweight_shape_0=} == 0 or {qzeros_shape_0=} == 0 or {qzeros_shape_1=} == 0")
    if infeatures < group_size:
        pytest.skip(f"{infeatures=} < {group_size=}")
    if infeatures != outfeatures:
        pytest.skip(f"{infeatures=} != {outfeatures=}")

    trainable = False
    use_cuda_fp16 = False
    kernel_switch_threshold = 128

    from auto_gptq.nn_modules.qlinear import qlinear_hpu, qlinear_cuda_old

    quant_hpu = qlinear_hpu.QuantLinear(bits=bits, group_size=group_size, infeatures=infeatures, outfeatures=outfeatures, bias=bias, use_cuda_fp16=use_cuda_fp16, kernel_switch_threshold=kernel_switch_threshold, trainable=trainable, weight_dtype=weight_dtype).to("hpu")
    # Cuda old implementation is the reference, also runs on hpu
    quant_ref_cuda_old = qlinear_cuda_old.QuantLinear(bits=bits, group_size=group_size, infeatures=infeatures, outfeatures=outfeatures, bias=bias, use_cuda_fp16=use_cuda_fp16, kernel_switch_threshold=kernel_switch_threshold, trainable=trainable, weight_dtype=weight_dtype).to("hpu")

    input = torch.rand((infeatures, outfeatures), dtype=weight_dtype).to("hpu")
    _, linear, s = gen_quant4(infeatures, outfeatures, group_size, bias)

    if scales_value == "ones":
        s = torch.ones_like(s)
    if scales_value == "range":
        range_t = torch.tensor(list(range(1, s.numel()+1)), dtype=torch.int32)
        shape_s = s.shape
        s = (torch.ones(s.numel()) * range_t).reshape(shape_s).contiguous()

    if weight_value == "ones":
        linear.weight = torch.nn.Parameter(torch.ones_like(linear.weight))
    elif weight_value == "zeros":
        linear.weight = torch.nn.Parameter(torch.zeros_like(linear.weight))
    elif weight_value == "range":
        shape_w = linear.weight.shape
        weight_local = torch.ones(shape_w, dtype=torch.int32)
        range_t_weight = torch.tensor(list(range(0, 8)), dtype=torch.int32)
        linear.weight = torch.nn.Parameter((torch.ones(weight_local.numel(), dtype=linear.weight.dtype).reshape(-1, 8) * range_t_weight).reshape(shape_w).contiguous())
    elif weight_value.isnumeric():
        linear.weight = torch.nn.Parameter(torch.full_like(linear.weight, int(weight_value)))
    linear.weight = torch.nn.Parameter(linear.weight.to(weight_dtype))

    if zeros_value == "zeros":
        zeros = torch.full((infeatures // group_size, outfeatures), 0, dtype=torch.int32)
    elif zeros_value == "range":
        zeros = torch.ones((infeatures // group_size, outfeatures), dtype=torch.int32)
        range_t_zeros = torch.tensor(list(range(1, 9)), dtype=torch.int32)
        shape_z = zeros.shape
        zeros = (torch.ones(zeros.numel(), dtype=torch.int32).reshape(-1, 8) * range_t_zeros).reshape(shape_z).contiguous()
    elif zeros_value == "eights":
        zeros = torch.full((infeatures // group_size, outfeatures), 8, dtype=torch.int32)
    else:
        zeros = torch.full((infeatures // group_size, outfeatures), 1, dtype=torch.int32)

    htcore.mark_step()
    quant_ref_cuda_old.pack(linear, s.clone().detach().T, zeros.clone().detach().T, g_idx=None)
    htcore.mark_step()
    quant_ref_cuda_old.to("hpu")

    # TODO: pack independently
    quant_hpu.set_packed(quant_ref_cuda_old)
    htcore.mark_step()
    quant_hpu.to("hpu")

    out_ref_cuda_old = quant_ref_cuda_old(input)
    htcore.mark_step()
    quant_hpu.post_init()
    htcore.mark_step()
    out_hpu = quant_hpu(input)
    htcore.mark_step()

    out_ref_cuda_old = out_ref_cuda_old.cpu()
    out_hpu = out_hpu.cpu()
    compare_tensors(out_hpu.cpu(), out_ref_cuda_old.cpu(), rtol=1e-05, atol=1e-08)
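
# ---- tests/test_peft_conversion.py (deleted in this commit) ----
# LoRA / AdaLoRA adapter injection and trainability on a quantized GPTQ model.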
import math
from unittest import TestCase
import torch.cuda.amp
from peft import TaskType
from peft.peft_model import PeftModelForCausalLM
from torch.optim import Adam
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.utils.peft_utils import (
    GPTQAdaLoraConfig,
    GPTQLoraConfig,
    GPTQLoraLinear,
    GPTQSVDLinear,
    get_gptq_peft_model,
)
MODEL_NAME = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
class TestPeftConversion(TestCase):
    def check_model_trainable(self, model_lora: PeftModelForCausalLM, tokenizer: AutoTokenizer) -> None:
        batch = tokenizer("Hello, world", return_tensors="pt")
        batch = {key: value.to(model_lora.device) for key, value in batch.items()}
        batch["labels"] = batch["input_ids"]
        batch["attention_mask"] = batch["attention_mask"].float()
        batch["attention_mask"].requires_grad = True

        model_lora.gradient_checkpointing_enable()
        optimizer = Adam(model_lora.parameters(), lr=1e-4)
        model_lora.train()

        losses = []
        for _ in range(30):
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                loss = model_lora(**batch).loss
            losses.append(loss.item())
            loss.backward()
            optimizer.step()

        self.assertTrue(losses[0] > losses[-1])
        self.assertTrue(all(math.isfinite(loss) for loss in losses))
        self.assertTrue(not any(math.isnan(loss) for loss in losses))

    def test_lora_conversion(self):
        model = AutoGPTQForCausalLM.from_quantized(
            MODEL_NAME,
            use_triton=False,
            warmup_triton=False,
            trainable=True,
            inject_fused_attention=True,
            inject_fused_mlp=False,
            use_safetensors=True,
        )
        peft_config = GPTQLoraConfig(
            r=16,
            lora_alpha=32,
            lora_dropout=0.1,
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            target_modules=["qkv_proj"],
        )
        model_lora = get_gptq_peft_model(
            model,
            peft_config,
            adapter_name="test",
            auto_find_all_linears=False,
            train_mode=True,
        )
        linear_layer = model_lora.base_model.model.model.layers[0].self_attn.qkv_proj
        self.assertTrue(isinstance(linear_layer, GPTQLoraLinear))

        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.check_model_trainable(model_lora, tokenizer)

    def test_adalora_conversion(self):
        model = AutoGPTQForCausalLM.from_quantized(
            MODEL_NAME,
            use_triton=False,
            warmup_triton=False,
            trainable=True,
            inject_fused_attention=True,
            inject_fused_mlp=False,
            use_safetensors=True,
        )
        peft_config = GPTQAdaLoraConfig(
            init_r=20,
            target_r=16,
            beta1=0.85,
            beta2=0.85,
            tinit=200,
            tfinal=1000,
            deltaT=10,
            lora_alpha=32,
            lora_dropout=0.1,
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            target_modules=["qkv_proj"],
        )
        model_lora = get_gptq_peft_model(
            model,
            peft_config,
            adapter_name="test",
            auto_find_all_linears=False,
            train_mode=True,
        )
        linear_layer = model_lora.base_model.model.model.layers[0].self_attn.qkv_proj
        self.assertTrue(isinstance(linear_layer, GPTQSVDLinear))

        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.check_model_trainable(model_lora, tokenizer)
# ---- tests/test_q4.py (deleted in this commit) ----
# (Diff collapsed in the original page; file contents not shown.)
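
# ---- tests/test_quantization.py (deleted in this commit) ----
# End-to-end quantization, saving and reloading in GPTQ and Marlin checkpoint formats.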
import os
import tempfile
import unittest
import torch.cuda
from parameterized import parameterized
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.quantization import CHECKPOINT_FORMAT, QUANT_CONFIG_FILENAME, BaseQuantizeConfig
class TestQuantization(unittest.TestCase):
    @parameterized.expand([(False,), (True,)])
    def test_quantize(self, use_marlin: bool):
        pretrained_model_dir = "saibo/llama-1B"

        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
        examples = [
            tokenizer(
                "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
            ),
            tokenizer(
                "Today I am in Paris and it is a wonderful day."
            ),
        ]

        quantize_config = BaseQuantizeConfig(
            bits=4,
            group_size=128,
            desc_act=False,
            checkpoint_format=CHECKPOINT_FORMAT.MARLIN if use_marlin else CHECKPOINT_FORMAT.GPTQ,
        )
        model = AutoGPTQForCausalLM.from_pretrained(
            pretrained_model_dir,
            quantize_config=quantize_config,
            use_flash_attention_2=False,
        )
        model.quantize(examples)

        with tempfile.TemporaryDirectory() as tmpdirname:
            model.save_pretrained(tmpdirname)

            model = AutoGPTQForCausalLM.from_quantized(tmpdirname, device="cuda:0", use_marlin=use_marlin)
            del model
            torch.cuda.empty_cache()

            # test compat: 1) with simple dict type 2) is_marlin_format
            compat_quantize_config = {
                "bits": 4,
                "group_size": 128,
                "desc_act": False,
                "is_marlin_format": use_marlin,
            }
            model = AutoGPTQForCausalLM.from_quantized(tmpdirname, device="cuda:0", quantize_config=compat_quantize_config)
            assert isinstance(model.quantize_config, BaseQuantizeConfig)
            del model
            torch.cuda.empty_cache()

            # test checkpoint_format hint to from_quantized()
            os.remove(f"{tmpdirname}/{QUANT_CONFIG_FILENAME}")
            compat_quantize_config = {
                "bits": 4,
                "group_size": 128,
                "desc_act": False,
            }
            model = AutoGPTQForCausalLM.from_quantized(
                tmpdirname,
                device="cuda:0",
                quantize_config=compat_quantize_config,
                checkpoint_format=CHECKPOINT_FORMAT.MARLIN if use_marlin else None,
            )
            assert isinstance(model.quantize_config, BaseQuantizeConfig)
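
# ---- tests/test_repacking.py (deleted in this commit) ----
# Repacking cuda-old GPTQ weights into the Marlin format and checking numerical agreement.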
import copy
import unittest
import autogptq_marlin_cuda
import torch
import torch.nn as nn
from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear as CudaOldQuantLinear
from auto_gptq.nn_modules.qlinear.qlinear_marlin import QuantLinear as MarlinQuantLinear
from auto_gptq.nn_modules.qlinear.qlinear_marlin import _get_perms, dequantize_weight
def gen_quant4(k, n, groupsize=-1):
    maxq = 2 ** 4 - 1
    w = torch.randn((k, n), dtype=torch.half, device="cpu")
    original_w = w.clone()
    if groupsize != -1:
        w = w.reshape((-1, groupsize, n))
        w = w.permute(1, 0, 2)
        w = w.reshape((groupsize, -1))
    s = torch.max(torch.abs(w), 0, keepdim=True)[0]
    s *= 2 / maxq
    # Quantize.
    w = torch.round(w / s).int()
    # Unsigned storage.
    w += (maxq + 1) // 2
    w = torch.clamp(w, 0, maxq)
    # Dequantize.
    ref = (w - (maxq + 1) // 2).half() * s
    if groupsize != -1:
        def reshape(w):
            w = w.reshape((groupsize, -1, n))
            w = w.permute(1, 0, 2)
            w = w.reshape((k, n)).contiguous()
            return w
        ref = reshape(ref)
        w = reshape(w)
    s = s.reshape((-1, n)).contiguous()
    linear = nn.Linear(k, n, bias=False)
    linear.weight.data = ref.t()
    return original_w, linear, s
original_w, linear, s = gen_quant4(64, 128)
class TestRepacking(unittest.TestCase):
    def test_marlin_fast_repacking(self):
        k = 2048
        n = 1024
        m = 5
        group_size = 128

        _, linear, s = gen_quant4(k, n, group_size)

        cuda_old_linear = CudaOldQuantLinear(bits=4, group_size=group_size, infeatures=k, outfeatures=n, bias=False)

        zeros = torch.full((k // group_size, n), 8, dtype=torch.int32)
        cuda_old_linear.pack(linear, s.T, zeros.T, g_idx=None)

        # Adapted from utils.marlin_utils.convert_to_marlin
        dequantized_weight, dequantized_qzeros = dequantize_weight(cuda_old_linear)
        dequantized_weight = dequantized_weight.to(torch.float16)
        self.assertTrue(torch.all(dequantized_qzeros == 8))

        linear_module = torch.nn.Linear(
            in_features=k,
            out_features=n,
            bias=False,
            dtype=torch.float16,
            device="cuda",
        )
        linear_module.weight.data.copy_(linear.weight.data)  # Not using dequantized_weight to avoid approx

        # Create new linear method and copy to model.
        marlin_linear = MarlinQuantLinear(
            bits=4,
            group_size=group_size,
            infeatures=k,
            outfeatures=n,
            bias=False,
            trainable=False,
        )
        marlin_linear.pack(linear_module.to("cuda"), scales=copy.deepcopy(cuda_old_linear.scales.data.t()).to("cuda"))

        inp = torch.rand(m, k, dtype=torch.float16, device="cuda")

        cuda_old_linear = cuda_old_linear.to("cuda")
        marlin_linear = marlin_linear.to("cuda")
        with torch.no_grad():
            res_cuda_old = cuda_old_linear(inp)
            res_marlin = marlin_linear(inp)

        reldiff = (res_cuda_old - res_marlin).abs() / (res_cuda_old.abs() + 1e-12)
        self.assertTrue(torch.mean(reldiff) < 4e-3)

        weight_repacked = autogptq_marlin_cuda.gptq_repack(cuda_old_linear.qweight)
        self.assertTrue(torch.allclose(weight_repacked, marlin_linear.B))

        _, _scale_perm, _scale_perm_single = _get_perms()
        s = cuda_old_linear.scales.data.clone()
        if group_size != k:
            s = s.reshape((1, -1))
            s = s.reshape((-1, len(_scale_perm)))[:, _scale_perm]
        else:
            s = s.reshape((-1, len(_scale_perm_single)))[:, _scale_perm_single]
        s = s.reshape((-1, n)).contiguous()
        self.assertTrue(torch.allclose(s, marlin_linear.s))
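
# ---- tests/test_serialization.py (deleted in this commit) ----
# Marlin checkpoint serialization, local saving and HF cache behaviour.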
import json
import os
import tempfile
import time
import unittest
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.quantization import CHECKPOINT_FORMAT, CHECKPOINT_FORMAT_FIELD, QUANT_CONFIG_FILENAME
from auto_gptq.quantization.config import QUANT_METHOD, BaseQuantizeConfig
class TestSerialization(unittest.TestCase):
    MODEL_ID = "habanoz/TinyLlama-1.1B-Chat-v0.3-GPTQ"

    def setUp(self):
        dummy_config = BaseQuantizeConfig(
            model_name_or_path=self.MODEL_ID,
            quant_method=QUANT_METHOD.GPTQ,
            checkpoint_format=CHECKPOINT_FORMAT.MARLIN,
        )
        model_cache_path, is_cached = dummy_config.get_cache_file_path()
        if is_cached:
            os.remove(model_cache_path)

    def test_marlin_local_serialization(self):
        start = time.time()
        model = AutoGPTQForCausalLM.from_quantized(self.MODEL_ID, device="cuda:0", use_marlin=True)
        end = time.time()
        first_load_time = end - start

        with tempfile.TemporaryDirectory() as tmpdir:
            model.save_pretrained(tmpdir)
            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "model.safetensors")))

            model_cache_path, is_cached = model.quantize_config.get_cache_file_path()
            self.assertFalse(os.path.isfile(os.path.join(tmpdir, model_cache_path)))

            with open(os.path.join(tmpdir, QUANT_CONFIG_FILENAME), "r") as config_file:
                config = json.load(config_file)
            self.assertTrue(config[CHECKPOINT_FORMAT_FIELD] == CHECKPOINT_FORMAT.MARLIN)

            start = time.time()
            model = AutoGPTQForCausalLM.from_quantized(tmpdir, device="cuda:0", use_marlin=True)
            end = time.time()
            second_load_time = end - start

            # Since we use a CUDA kernel to repack weights, the first load time is already small.
            self.assertTrue(second_load_time < first_load_time)

    def test_marlin_hf_cache_serialization(self):
        start = time.time()
        model = AutoGPTQForCausalLM.from_quantized(self.MODEL_ID, device="cuda:0", use_marlin=True)
        self.assertTrue(model.quantize_config.checkpoint_format == CHECKPOINT_FORMAT.MARLIN)
        end = time.time()
        first_load_time = end - start

        model_cache_path, is_cached = model.quantize_config.get_cache_file_path()
        self.assertTrue("assets" in model_cache_path)
        self.assertTrue(is_cached)

        start = time.time()
        model = AutoGPTQForCausalLM.from_quantized(self.MODEL_ID, device="cuda:0", use_marlin=True)
        self.assertTrue(model.quantize_config.checkpoint_format == CHECKPOINT_FORMAT.MARLIN)
        end = time.time()
        second_load_time = end - start

        # Since we use a CUDA kernel to repack weights, the first load time is already small.
        self.assertTrue(second_load_time < first_load_time)
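
# ---- tests/test_sharded_loading.py (deleted in this commit) ----
# Loading sharded GPTQ checkpoints and checking generation output.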
import unittest
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
class TestShardedLoading(unittest.TestCase):
    def test_loading(self):
        model_name = "TheBlokeAI/llama-68m-GPTQ-sharded"

        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        model = AutoGPTQForCausalLM.from_quantized(model_name, device='cuda:0')

        tokens = model.generate(**tokenizer("1337", return_tensors="pt").to(model.device), max_new_tokens=20)[0]
        result = tokenizer.decode(tokens)
        self.assertTrue(result == '<s> 133777777777777777777777')

    def test_loading_large(self):
        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat-GPTQ-Int4")
        model = AutoGPTQForCausalLM.from_quantized("Qwen/Qwen1.5-7B-Chat-GPTQ-Int4", device='cuda:0')

        tokens = model.generate(**tokenizer("Today I am in Paris and", return_tensors="pt").to(model.device), max_new_tokens=20)[0]
        result = tokenizer.decode(tokens)
        self.assertTrue(result == 'Today I am in Paris and I am going to the Louvre Museum. I want to see the Mona Lisa painting, but I')
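
# ---- tests/test_triton.py (deleted in this commit) ----
# Triton vs. Triton-v2 QuantLinear: output equivalence and forward-pass speed.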
import os
import unittest
import torch
import torch.utils.benchmark as benchmark
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
MODEL_ID = "TheBloke/Llama-7B-GPTQ"
DATASET_ID = "timdettmers/openassistant-guanaco"
LEARNING_RATE = 3e-5
MAX_SEQ_LEN = 10
BATCH_SIZE = 5
NUM_TRAIN_STEPS = 10
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def benchmark_forward(
    fn,
    *inputs,
    repeats="auto",
    desc="",
    verbose=True,
    amp=False,
    amp_dtype=torch.float16,
    **kwinputs,
):
    if verbose:
        print(desc, "- Forward pass")

    def amp_wrapper(*inputs, **kwinputs):
        with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
            fn(*inputs, **kwinputs)

    t = benchmark.Timer(
        stmt="fn_amp(*inputs, **kwinputs)",
        globals={"fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs},
        num_threads=torch.get_num_threads(),
    )
    if repeats == "auto":
        m = t.blocked_autorange()
    else:
        m = t.timeit(repeats)
    if verbose:
        print(m)
    return t, m
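
# With repeats="auto", Timer.blocked_autorange() keeps running batches of the statement
# until enough total measurement time has accumulated for a stable estimate; passing an
# integer instead times exactly that many runs via Timer.timeit(repeats).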
def get_model_and_tokenizer(
    model_id=MODEL_ID,
    inject_fused_attention=False,
    inject_fused_mlp=False,
    **model_kwargs,
):
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        use_fast=True,
    )
    if not tokenizer.pad_token_id:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        trainable=True,
        inject_fused_attention=inject_fused_attention,
        inject_fused_mlp=inject_fused_mlp,
        disable_exllamav2=True,
        disable_exllama=True,
        **model_kwargs,
    )
    model.warmup_triton()
    return model, tokenizer
class TestTriton(unittest.TestCase):
    def test_triton_qlinear(self):
        ref_model, _ = get_model_and_tokenizer(
            model_id=MODEL_ID,
            use_triton=True,
            inject_fused_attention=False,
            inject_fused_mlp=False,
        )
        test_model, _ = get_model_and_tokenizer(
            model_id=MODEL_ID,
            use_tritonv2=True,
            inject_fused_attention=False,
            inject_fused_mlp=False,
        )
        hidden_size = ref_model.model.model.embed_tokens.weight.shape[1]
        test_data = torch.randn((1, 2048, hidden_size), dtype=torch.float16).cuda()

        qlinear_ref = ref_model.model.model.layers[0].self_attn.q_proj
        qlinear_test = test_model.model.model.layers[0].self_attn.q_proj

        test_out = qlinear_test(test_data)
        ref_out = qlinear_ref(test_data)
        self.assertTrue(torch.allclose(test_out, ref_out))

        _, measure_triton = benchmark_forward(qlinear_ref, test_data, desc="Triton", verbose=True)
        _, measure_tritonv2 = benchmark_forward(qlinear_test, test_data, desc="Triton-v2", verbose=True)
        self.assertTrue(measure_tritonv2.mean < measure_triton.mean)