Commit f8070792 authored by yangql

Deleted tests/__init__.py, tests/bench_autoawq_autogptq.py, tests/pytest.ini, tests/test_awq_compatibility_generation.py, tests/test_hpu_linear.py, tests/test_peft_conversion.py, tests/test_q4.py, tests/test_quantization.py, tests/test_repacking.py, tests/test_serialization.py, tests/test_sharded_loading.py, tests/test_triton.py files
parent a2630e0f
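
# =========================================================================
# tests/bench_autoawq_autogptq.py
# Benchmarks a single 4-bit quantized linear layer (shaped like the Yi-34B
# down_proj, k=20480, n=7168) with AutoGPTQ's ExLlamaV2 kernel against
# AutoAWQ's GEMV/GEMM kernels over a sweep of sequence lengths, and prints
# the results as CSV. Run directly, e.g. `python tests/bench_autoawq_autogptq.py`,
# on a machine with a CUDA GPU and both auto-gptq and autoawq installed.
# =========================================================================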
import torch
try:
from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV
except ModuleNotFoundError as e:
raise ModuleNotFoundError(
f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this benchmark. {e}"
)
import numpy as np
from auto_gptq.modeling._utils import autogptq_post_init
from auto_gptq.nn_modules.qlinear.qlinear_exllamav2 import QuantLinear
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
group_size = 128
bits = 4
# Yi 34B down_proj
k = 20480
n = 7168
device = torch.device("cuda:0")
linear_class = dynamically_import_QuantLinear(use_triton=False, desc_act=False, group_size=group_size, bits=4)
linear_gptq = linear_class(
bits=bits,
group_size=group_size,
infeatures=k,
outfeatures=n,
bias=False,
)
assert isinstance(linear_gptq, QuantLinear)
linear_gptq = linear_gptq.eval()
linear_gptq = linear_gptq.to(device)
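# autogptq_post_init finalizes kernel setup (e.g. allocating the exllama/exllamav2
# scratch buffers) once the layer sits on its target device.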
linear_gptq = autogptq_post_init(linear_gptq, use_act_order=False)
num_runs = 60
lines = []
seqlens = [
1,
2,
3,
4,
5,
6,
7,
8,
12,
16,
24,
32,
48,
64,
80,
120,
250,
512,
1024,
2048,
4000,
8000,
]
print(f"in_features={k}, out_features={n}")
for query_length in seqlens:
# batch_size, query_length, hidden_size
inp = torch.rand(1, query_length, k, dtype=torch.float16).to(device)
torch.cuda.empty_cache()
# Warmup Exllama v2
with torch.no_grad():
res = linear_gptq(inp)
latencies = []
torch.cuda.synchronize()
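# Time each forward pass with CUDA events; elapsed_time() reports milliseconds,
# and the surrounding synchronize() calls ensure the kernels have actually finished.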
for _ in range(num_runs):
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
start_event.record()
res = linear_gptq(inp)
end_event.record()
torch.cuda.synchronize()
latency_ms = start_event.elapsed_time(end_event)
latencies.append(latency_ms)
# print("-------")
# print(f"Latency GPTQ Exllama v2 (query_length={query_length}): {np.mean(latencies):.3f} ms, p10={np.percentile(latencies, 10):.3f}, p90={np.percentile(latencies, 90):.3f}")
exllamav2_mean_latency = np.mean(latencies)
exllamav2_p10 = np.percentile(latencies, 10)
exllamav2_p90 = np.percentile(latencies, 90)
torch.cuda.empty_cache()
total_seqlen = inp.shape[:-1].numel()
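# AutoAWQ ships two kernels: WQLinear_GEMV, tuned for very short (decode-style)
# inputs, and WQLinear_GEMM for longer (prefill-style) inputs; mirror that
# choice here based on the total number of tokens.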
if total_seqlen <= 8:
awq_kernel = "GEMV"
linear_awq = WQLinear_GEMV(
w_bit=bits,
group_size=group_size,
in_features=k,
out_features=n,
bias=False,
dev=device,
)
else:
awq_kernel = "GEMM"
linear_awq = WQLinear_GEMM(
w_bit=bits,
group_size=group_size,
in_features=k,
out_features=n,
bias=False,
dev=device,
)
# Warmup AWQ
with torch.no_grad():
res = linear_awq(inp)
latencies = []
torch.cuda.synchronize()
for _ in range(num_runs):
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
start_event.record()
res = linear_awq(inp)
end_event.record()
torch.cuda.synchronize()
latency_ms = start_event.elapsed_time(end_event)
latencies.append(latency_ms)
awq_mean_latency = np.mean(latencies)
awq_p10 = np.percentile(latencies, 10)
awq_p90 = np.percentile(latencies, 90)
exllama_speedup = awq_mean_latency / exllamav2_mean_latency
# print(f"Latency AWQ (query_length={query_length}, kernel={awq_kernel}): {np.mean(latencies):.3f} ms, p10={np.percentile(latencies, 10):.3f}, p90={np.percentile(latencies, 90):.3f}")
line = "{},{},{},{},{},{},{},{},{},{},{}".format(
bits,
group_size,
total_seqlen,
awq_kernel,
f"{awq_mean_latency:.3f}",
f"{exllamav2_mean_latency:.3f}",
f"{awq_p10:.3f}",
f"{awq_p90:.3f}",
f"{exllamav2_p10:.3f}",
f"{exllamav2_p90:.3f}",
f"{exllama_speedup:.3f}",
)
lines.append(line)
header = "bits, group_size, total_seqlen, awq_kernel, awq_mean_latency (ms), exllamav2_mean_latency (ms), awq_p10, awq_p90, exllamav2_p10, exllamav2_p90, exllama_speedup"
print(header)
for line in lines:
print(line)
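
# =========================================================================
# tests/pytest.ini
# =========================================================================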
[pytest]
addopts=-s -v
log_cli=true
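
# =========================================================================
# tests/test_awq_compatibility_generation.py
# Loads TheBloke/Llama-2-7B-Chat-AWQ through AutoGPTQ's various backends
# (PyTorch fallback, cuda 64/256, exllama, exllamav2) and checks that the
# generated text matches AutoAWQ's own output.
# =========================================================================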
# ruff: noqa: I001
import unittest
import torch
import autogptq_cuda_64
import autogptq_cuda_256
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear as CudaOldQLinear
try:
from awq import AutoAWQForCausalLM
except ModuleNotFoundError as e:
AutoAWQForCausalLM = None
AWQ_EXCEPTION = e
class TestAwqCompatibility(unittest.TestCase):
# TODO: test cuda-old fp16.
# TODO: test cuda-old fp32.
# TODO: test exllama v2.
def test_generation_cuda_old_fp32_pytorch(self):
if AutoAWQForCausalLM is None:
self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")
device = torch.device("cuda:0")
quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"
model_autogptq = AutoGPTQForCausalLM.from_quantized(
quant_path,
device=device,
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
disable_exllama=True,
disable_exllamav2=True,
torch_dtype=torch.float32,
)
tokenizer = AutoTokenizer.from_pretrained(quant_path)
prompt = "I am in Paris and I am going to see the"
inp = tokenizer(prompt, return_tensors="pt").to(device)
for name, submodule in model_autogptq.named_modules():
if isinstance(submodule, CudaOldQLinear):
# Just a hack to test the handmade pytorch implementation path.
submodule.autogptq_cuda_available = False
autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
autogptq_output = tokenizer.decode(autogptq_output[0])
model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
awq_output = model_awq.generate(
**inp,
num_beams=1,
min_new_tokens=30,
max_new_tokens=30,
)
awq_output = tokenizer.decode(awq_output[0])
self.assertTrue(awq_output == autogptq_output)
def test_generation_cuda_old_cuda_256(self):
if AutoAWQForCausalLM is None:
self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")
device = torch.device("cuda:0")
quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"
tokenizer = AutoTokenizer.from_pretrained(quant_path)
prompt = "I am in Paris and I am going to see the"
for torch_dtype in [torch.float16, torch.float32]:
model_autogptq = AutoGPTQForCausalLM.from_quantized(
quant_path,
device=device,
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
disable_exllama=True,
disable_exllamav2=True,
torch_dtype=torch_dtype,
)
for name, module in model_autogptq.named_modules():
if isinstance(module, CudaOldQLinear):
self.assertTrue(module.autogptq_cuda == autogptq_cuda_256)
if torch_dtype == torch.float32:
self.assertFalse(module.use_cuda_fp16)
else:
self.assertTrue(module.use_cuda_fp16)
inp = tokenizer(prompt, return_tensors="pt").to(device)
autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
autogptq_output = tokenizer.decode(autogptq_output[0])
model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
awq_output = model_awq.generate(
**inp,
num_beams=1,
min_new_tokens=30,
max_new_tokens=30,
)
awq_output = tokenizer.decode(awq_output[0])
self.assertTrue(awq_output == autogptq_output)
def test_generation_cuda_old_cuda_64(self):
if AutoAWQForCausalLM is None:
self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")
device = torch.device("cuda:0")
quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"
tokenizer = AutoTokenizer.from_pretrained(quant_path)
prompt = "I am in Paris and I am going to see the"
for torch_dtype in [torch.float16, torch.float32]:
model_autogptq = AutoGPTQForCausalLM.from_quantized(
quant_path,
device=device,
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
disable_exllama=True,
disable_exllamav2=True,
torch_dtype=torch_dtype,
)
# Force autogptq_cuda_64.
for name, module in model_autogptq.named_modules():
if isinstance(module, CudaOldQLinear):
if module.autogptq_cuda != autogptq_cuda_64:
module.autogptq_cuda = autogptq_cuda_64
if torch_dtype == torch.float32:
self.assertFalse(module.use_cuda_fp16)
else:
self.assertTrue(module.use_cuda_fp16)
inp = tokenizer(prompt, return_tensors="pt").to(device)
autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
autogptq_output = tokenizer.decode(autogptq_output[0])
model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
awq_output = model_awq.generate(
**inp,
num_beams=1,
min_new_tokens=30,
max_new_tokens=30,
)
awq_output = tokenizer.decode(awq_output[0])
self.assertTrue(awq_output == autogptq_output)
def test_generation_exllama(self):
if AutoAWQForCausalLM is None:
self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")
device = torch.device("cuda:0")
quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"
model_autogptq = AutoGPTQForCausalLM.from_quantized(
quant_path,
device=device,
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
disable_exllama=False,
disable_exllamav2=True,
torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(quant_path)
prompt = "I am in Paris and I am going to see the"
inp = tokenizer(prompt, return_tensors="pt").to(device)
for name, submodule in model_autogptq.named_modules():
if isinstance(submodule, CudaOldQLinear):
# Just a hack to test the handmade pytorch implementation path.
submodule.autogptq_cuda_available = False
autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
autogptq_output = tokenizer.decode(autogptq_output[0])
model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
awq_output = model_awq.generate(
**inp,
num_beams=1,
min_new_tokens=30,
max_new_tokens=30,
)
awq_output = tokenizer.decode(awq_output[0])
self.assertTrue(awq_output == autogptq_output)
def test_generation_exllamav2(self):
if AutoAWQForCausalLM is None:
self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")
device = torch.device("cuda:0")
quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"
model_autogptq = AutoGPTQForCausalLM.from_quantized(
quant_path,
device=device,
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(quant_path)
prompt = "I am in Paris and I am going to see the"
inp = tokenizer(prompt, return_tensors="pt").to(device)
for name, submodule in model_autogptq.named_modules():
if isinstance(submodule, CudaOldQLinear):
# Just a hack to test the handmade pytorch implementation path.
submodule.autogptq_cuda_available = False
autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
autogptq_output = tokenizer.decode(autogptq_output[0])
model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
awq_output = model_awq.generate(
**inp,
num_beams=1,
min_new_tokens=30,
max_new_tokens=30,
)
awq_output = tokenizer.decode(awq_output[0])
self.assertTrue(awq_output == autogptq_output)
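
# =========================================================================
# tests/test_hpu_linear.py
# Compares the Habana HPU QuantLinear against the cuda-old QuantLinear
# reference over a grid of shapes, group sizes, dtypes and synthetic
# scale/weight/zero-point patterns.
# =========================================================================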
import numpy as np
import math
import torch
import pytest
try:
import habana_frameworks.torch.core as htcore
except Exception as e:
pytestmark = pytest.mark.skip("Couldn't import HPU plugin, skipping HPU tests")
def _convert_to_tensor_list(tensor_or_tensors):
if isinstance(tensor_or_tensors, tuple):
return list(tensor_or_tensors)
elif isinstance(tensor_or_tensors, list):
return tensor_or_tensors
elif isinstance(tensor_or_tensors, torch.Tensor):
# You can't return list(tensor_or_tensors), because it will fail on 0-d tensors
result_list = []
result_list.append(tensor_or_tensors)
return result_list
else:
raise TypeError("Can not convert outputs")
def compare_tensors(hpu_tensors, cpu_tensors, atol, rtol, assert_enable=True):
hpu_tensors = _convert_to_tensor_list(hpu_tensors)
cpu_tensors = _convert_to_tensor_list(cpu_tensors)
assert len(hpu_tensors) == len(cpu_tensors)
hpu_tensors = [tensor.to('cpu') if tensor is not None else tensor for tensor in hpu_tensors]
for i in range(len(hpu_tensors)):
if cpu_tensors[i] is None and hpu_tensors[i] is None:
continue
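# numpy has no bfloat16/float8 dtypes, so upcast those tensors to float32
# before converting them to arrays for comparison.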
hpu_tensors[i] = (
hpu_tensors[i].float()
if hpu_tensors[i].dtype in [torch.bfloat16, torch.float8_e5m2, torch.float8_e4m3fn]
else hpu_tensors[i]
)
cpu_tensors[i] = (
cpu_tensors[i].float()
if cpu_tensors[i].dtype in [torch.bfloat16, torch.float8_e5m2, torch.float8_e4m3fn]
else cpu_tensors[i]
)
if assert_enable:
np.testing.assert_allclose(
hpu_tensors[i].detach().numpy(),
cpu_tensors[i].detach().numpy(),
atol=atol,
rtol=rtol,
)
else:
print("hpu_result[{}]".format(i), hpu_tensors[i].detach().numpy())
print("cpu_result[{}]".format(i), cpu_tensors[i].detach().numpy())
return np.allclose(
hpu_tensors[i].detach().numpy(),
cpu_tensors[i].detach().numpy(),
atol=atol,
rtol=rtol,
equal_nan=True,
)
# taken from AutoGPTQ/tests/test_repacking.py
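# Draws a random bfloat16 weight, fake-quantizes it to unsigned 4-bit with
# per-group scales, and returns (original weight, an nn.Linear holding the
# dequantized reference weight, the scales).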
def gen_quant4(k, n, groupsize=-1, bias=False):
maxq = 2 ** 4 - 1
w = torch.randn((k, n), dtype=torch.bfloat16, device="cpu")
original_w = w.clone()
if groupsize != -1:
w = w.reshape((-1, groupsize, n))
w = w.permute(1, 0, 2)
w = w.reshape((groupsize, -1))
s = torch.max(torch.abs(w), 0, keepdim=True)[0]
s *= 2 / maxq
# Quantize.
w = torch.round(w / s).int()
# Unsigned storage.
w += (maxq + 1) // 2
w = torch.clamp(w, 0, maxq)
# Dequantize.
ref = (w - (maxq + 1) // 2).bfloat16() * s
if groupsize != -1:
def reshape(w):
w = w.reshape((groupsize, -1, n))
w = w.permute(1, 0, 2)
w = w.reshape((k, n)).contiguous()
return w
ref = reshape(ref)
w = reshape(w)
s = s.reshape((-1, n)).contiguous()
linear = torch.nn.Linear(k, n, bias=bias)
linear.weight.data = ref.t()
return original_w, linear, s
@pytest.mark.parametrize("bits", [4])
@pytest.mark.parametrize("group_size", [16, 32, 128])
@pytest.mark.parametrize("infeatures", [64, 128, 512, 4096, 11008])
@pytest.mark.parametrize("outfeatures", [64, 128, 512, 4096, 11008])
@pytest.mark.parametrize("bias", [True, False], ids=["bias", "no_bias"])
@pytest.mark.parametrize("scales_value, weight_value, zeros_value", [("normal", "normal", "normal"), ("normal", "normal", "range"), ("normal", "normal", "zeros"), ("ones", "zeros", "zeros"), ("ones", "zeros", "eights"), ("ones", "range", "zeros"), ("ones", "range", "ones"), ("ones", "7", "ones"), ("ones", "zeros", "range"),("ones", "zeros", "ones"), ("ones", "range", "range"), ("range", "range", "range"), ("range", "range", "zeros")])
@pytest.mark.parametrize("weight_dtype", [torch.bfloat16, torch.float32], ids=["bf16", "fp32"])
def test_qlinear_hpu(bits, group_size, infeatures, outfeatures, bias, scales_value, weight_value, zeros_value, weight_dtype):
qweight_shape_0 = infeatures // 32 * bits
qzeros_shape_0 = math.ceil(infeatures / group_size)
qzeros_shape_1 = outfeatures // 32 * bits
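# 4-bit values are packed 8 per int32, hence the `// 32 * bits` factors.
# As a worked example (a typical shape from the parametrization above):
# infeatures = outfeatures = 4096 with group_size = 128 gives
# qweight (512, 4096) and qzeros (32, 512).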
if qweight_shape_0 == 0 or qzeros_shape_0 == 0 or qzeros_shape_1 == 0:
pytest.skip(f"{qweight_shape_0=} == 0 or {qzeros_shape_0=} == 0 or {qzeros_shape_1=} == 0")
if infeatures < group_size:
pytest.skip(f"{infeatures=} < {group_size=}")
if infeatures != outfeatures:
pytest.skip(f"{infeatures=} != {outfeatures=}")
trainable = False
use_cuda_fp16 = False
kernel_switch_threshold = 128
from auto_gptq.nn_modules.qlinear import qlinear_hpu, qlinear_cuda_old
quant_hpu = qlinear_hpu.QuantLinear(bits=bits, group_size=group_size, infeatures=infeatures, outfeatures=outfeatures, bias=bias, use_cuda_fp16=use_cuda_fp16, kernel_switch_threshold=kernel_switch_threshold, trainable=trainable, weight_dtype=weight_dtype).to("hpu")
# The cuda-old implementation is the reference; it also runs on HPU.
quant_ref_cuda_old = qlinear_cuda_old.QuantLinear(bits=bits, group_size=group_size, infeatures=infeatures, outfeatures=outfeatures, bias=bias, use_cuda_fp16=use_cuda_fp16, kernel_switch_threshold=kernel_switch_threshold, trainable=trainable, weight_dtype=weight_dtype).to("hpu")
input = torch.rand((infeatures, outfeatures), dtype=weight_dtype).to("hpu")
_, linear, s = gen_quant4(infeatures, outfeatures, group_size, bias)
if scales_value == "ones":
s = torch.ones_like(s)
if scales_value == "range":
range_t = torch.tensor(list(range(1, s.numel()+1)), dtype=torch.int32)
shape_s = s.shape
s = (torch.ones(s.numel()) * range_t).reshape(shape_s).contiguous()
if weight_value == "ones":
linear.weight = torch.nn.Parameter(torch.ones_like(linear.weight))
elif weight_value == "zeros":
linear.weight = torch.nn.Parameter(torch.zeros_like(linear.weight))
elif weight_value == "range":
shape_w = linear.weight.shape
weight_local = torch.ones(shape_w, dtype=torch.int32)
range_t_weight = torch.tensor(list(range(0, 8)), dtype=torch.int32)
linear.weight = torch.nn.Parameter((torch.ones(weight_local.numel(), dtype=linear.weight.dtype).reshape(-1, 8) * range_t_weight).reshape(shape_w).contiguous())
elif weight_value.isnumeric():
linear.weight = torch.nn.Parameter(torch.full_like(linear.weight, int(weight_value)))
linear.weight = torch.nn.Parameter(linear.weight.to(weight_dtype))
if zeros_value == "zeros":
zeros = torch.full((infeatures // group_size, outfeatures), 0, dtype=torch.int32)
elif zeros_value == "range":
zeros = torch.ones((infeatures // group_size, outfeatures), dtype=torch.int32)
range_t_zeros = torch.tensor(list(range(1, 9)), dtype=torch.int32)
shape_z = zeros.shape
zeros = (torch.ones(zeros.numel(), dtype=torch.int32).reshape(-1, 8) * range_t_zeros).reshape(shape_z).contiguous()
elif zeros_value == "eights":
zeros = torch.full((infeatures // group_size, outfeatures), 8, dtype=torch.int32)
else:
zeros = torch.full((infeatures // group_size, outfeatures), 1, dtype=torch.int32)
htcore.mark_step()
quant_ref_cuda_old.pack(linear, s.clone().detach().T, zeros.clone().detach().T, g_idx=None)
htcore.mark_step()
quant_ref_cuda_old.to("hpu")
#TODO: pack independently
quant_hpu.set_packed(quant_ref_cuda_old)
htcore.mark_step()
quant_hpu.to("hpu")
out_ref_cuda_old = quant_ref_cuda_old(input)
htcore.mark_step()
quant_hpu.post_init()
htcore.mark_step()
out_hpu = quant_hpu(input)
htcore.mark_step()
out_ref_cuda_old = out_ref_cuda_old.cpu()
out_hpu = out_hpu.cpu()
compare_tensors(out_hpu, out_ref_cuda_old, rtol=1e-05, atol=1e-08)
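
# =========================================================================
# tests/test_peft_conversion.py
# Verifies that LoRA / AdaLoRA adapters can be attached to a quantized model
# via get_gptq_peft_model and that the resulting model actually trains (the
# loss decreases and stays finite).
# =========================================================================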
import math
from unittest import TestCase
import torch.cuda.amp
from peft import TaskType
from peft.peft_model import PeftModelForCausalLM
from torch.optim import Adam
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.utils.peft_utils import (
GPTQAdaLoraConfig,
GPTQLoraConfig,
GPTQLoraLinear,
GPTQSVDLinear,
get_gptq_peft_model,
)
MODEL_NAME = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
class TestPeftConversion(TestCase):
def check_model_trainable(self, model_lora: PeftModelForCausalLM, tokenizer: AutoTokenizer) -> None:
batch = tokenizer("Hello, world", return_tensors="pt")
batch = {key: value.to(model_lora.device) for key, value in batch.items()}
batch["labels"] = batch["input_ids"]
batch["attention_mask"] = batch["attention_mask"].float()
batch["attention_mask"].requires_grad = True
model_lora.gradient_checkpointing_enable()
optimizer = Adam(model_lora.parameters(), lr=1e-4)
model_lora.train()
losses = []
for _ in range(30):
optimizer.zero_grad()
with torch.cuda.amp.autocast():
loss = model_lora(**batch).loss
losses.append(loss.item())
loss.backward()
optimizer.step()
self.assertTrue(losses[0] > losses[-1])
self.assertTrue(all(math.isfinite(loss) for loss in losses))
self.assertTrue(not any(math.isnan(loss) for loss in losses))
def test_lora_conversion(self):
model = AutoGPTQForCausalLM.from_quantized(
MODEL_NAME,
use_triton=False,
warmup_triton=False,
trainable=True,
inject_fused_attention=True,
inject_fused_mlp=False,
use_safetensors=True,
)
peft_config = GPTQLoraConfig(
r=16,
lora_alpha=32,
lora_dropout=0.1,
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
target_modules=["qkv_proj"],
)
model_lora = get_gptq_peft_model(
model,
peft_config,
adapter_name="test",
auto_find_all_linears=False,
train_mode=True,
)
linear_layer = model_lora.base_model.model.model.layers[0].self_attn.qkv_proj
self.assertTrue(isinstance(linear_layer, GPTQLoraLinear))
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
self.check_model_trainable(model_lora, tokenizer)
def test_adalora_conversion(self):
model = AutoGPTQForCausalLM.from_quantized(
MODEL_NAME,
use_triton=False,
warmup_triton=False,
trainable=True,
inject_fused_attention=True,
inject_fused_mlp=False,
use_safetensors=True,
)
peft_config = GPTQAdaLoraConfig(
init_r=20,
target_r=16,
beta1=0.85,
beta2=0.85,
tinit=200,
tfinal=1000,
deltaT=10,
lora_alpha=32,
lora_dropout=0.1,
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
target_modules=["qkv_proj"],
)
model_lora = get_gptq_peft_model(
model,
peft_config,
adapter_name="test",
auto_find_all_linears=False,
train_mode=True,
)
linear_layer = model_lora.base_model.model.model.layers[0].self_attn.qkv_proj
self.assertTrue(isinstance(linear_layer, GPTQSVDLinear))
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
self.check_model_trainable(model_lora, tokenizer)
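
# =========================================================================
# tests/test_q4.py
# Kernel-level and end-to-end generation tests for the 4-bit backends:
# exllama, exllamav2, cuda-old, Marlin, Triton v2, Mixtral and HPU.
# =========================================================================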
import unittest
import torch
from parameterized import parameterized
from auto_gptq.nn_modules.qlinear.qlinear_exllama import QuantLinear
from auto_gptq.nn_modules.qlinear.qlinear_marlin import QuantLinear as MarlinQuantLinear
from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import QuantLinear as TritonV2QuantLinear
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
try:
from exllama_kernels import prepare_buffers, set_tuning_params
except ImportError as e:
print(f"[WARNING] Could not load exllama_kernels: {e}")
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, exllama_set_max_input_length
from auto_gptq.modeling._const import EXLLAMA_DEFAULT_MAX_INPUT_LENGTH
from auto_gptq.modeling._utils import autogptq_post_init
def get_diff(a, ref):
eps = 1e-6
return f"Maxdiff: {(a - ref).abs().max()}, Mean relative diff: {((a - ref).abs() / (ref.abs() + eps)).mean()}"
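# Expected output (1024 fp16 values) of the seeded 4-bit linear layer used in
# TestsQ4Exllama and TestsQ4ExllamaV2 below; judging by the name, it was
# generated with the cuda-old kernel.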
CUDA_OLD_REFERENCE = torch.Tensor(
[
5.8398,
6.8555,
7.2734,
6.4219,
6.2070,
5.8203,
6.5664,
6.4219,
6.2148,
5.3281,
5.7578,
7.5312,
8.1016,
6.1133,
7.2031,
6.6484,
6.5156,
6.0117,
6.0312,
6.1914,
6.2109,
6.8125,
5.8125,
7.1172,
7.3125,
6.7305,
5.9961,
6.5117,
6.1914,
5.9648,
7.1680,
6.4766,
7.2070,
6.5469,
6.7734,
6.4219,
6.8086,
7.0469,
5.9297,
6.4727,
6.2539,
5.9570,
7.2383,
5.8945,
6.0820,
5.7969,
7.1094,
6.2188,
6.7500,
7.3555,
6.2930,
6.7734,
5.9219,
7.4805,
6.8750,
6.4102,
6.5898,
6.5469,
7.6016,
6.7461,
5.9492,
7.2227,
5.8164,
5.4570,
6.2930,
7.3984,
6.0938,
7.3984,
5.9609,
6.3516,
6.5664,
5.7969,
7.1250,
6.0781,
6.7930,
5.9492,
6.1641,
6.5898,
6.0586,
6.3359,
6.7930,
7.0469,
6.0664,
6.3320,
5.4414,
6.7617,
5.1641,
7.2891,
6.8516,
6.5312,
5.6914,
7.3711,
6.8203,
5.9492,
7.0781,
6.3164,
7.1992,
7.1133,
7.4219,
7.5586,
7.1836,
6.9102,
6.4844,
6.9805,
6.1953,
6.5156,
5.4844,
6.6602,
6.6719,
7.9844,
6.4727,
6.6367,
6.2227,
6.4531,
5.0625,
6.4609,
6.7031,
6.6445,
6.5234,
6.8633,
6.6055,
5.6055,
6.4453,
7.2617,
6.3945,
6.6367,
6.1055,
7.0664,
6.0820,
6.6875,
6.1445,
6.8672,
6.2070,
6.8828,
6.1484,
6.7070,
6.8516,
6.2734,
7.1055,
7.0586,
6.9648,
5.9727,
6.1016,
6.8750,
7.0078,
7.1523,
5.7383,
5.9531,
6.5508,
7.5352,
6.1602,
6.2578,
6.3906,
5.7383,
6.7031,
5.7344,
6.3516,
5.2852,
7.5312,
6.4531,
6.6406,
6.2266,
6.1094,
5.9102,
5.7617,
6.3789,
7.0508,
6.3750,
6.3320,
6.8555,
6.7266,
7.0352,
7.7695,
6.3984,
6.5039,
6.8320,
6.1602,
6.0312,
6.3828,
6.9023,
7.4336,
7.3711,
6.1016,
7.0703,
6.3281,
6.8281,
6.4922,
5.9453,
5.1016,
6.7188,
6.1406,
6.6289,
7.2695,
6.2070,
6.7070,
7.2930,
7.1836,
6.3828,
6.1992,
6.7070,
7.8008,
7.7773,
5.6602,
7.0273,
6.6172,
6.0898,
5.3516,
7.3359,
5.9727,
6.0078,
7.0586,
6.3086,
6.8555,
7.2617,
7.3477,
6.3828,
7.1133,
6.6328,
7.3516,
6.9141,
7.2031,
6.9805,
6.1719,
6.7812,
8.3047,
6.5898,
6.3633,
6.2539,
7.2773,
6.5938,
6.4141,
6.8203,
6.8906,
7.8828,
5.9609,
6.4180,
7.3984,
5.7539,
7.1758,
6.6641,
6.9062,
6.2578,
7.5508,
6.1719,
6.5742,
5.9375,
6.7891,
6.2109,
6.5039,
6.8750,
6.2031,
6.8828,
7.1094,
5.9570,
7.2969,
6.6797,
6.8828,
5.5430,
6.9648,
5.8398,
6.5430,
6.3945,
6.5664,
5.8086,
6.6172,
7.0586,
6.8867,
6.0820,
5.8125,
6.7070,
7.5742,
6.2578,
6.1328,
6.5391,
5.4531,
6.8242,
6.6953,
6.8008,
6.3398,
6.4805,
7.2266,
6.3281,
6.6875,
6.4688,
5.9414,
7.4297,
5.8711,
6.0625,
5.8750,
6.5664,
5.8867,
6.3477,
6.1133,
6.9453,
5.0547,
6.7812,
6.4922,
7.2422,
5.4688,
6.2109,
7.2148,
6.1758,
5.9297,
7.1953,
5.5195,
6.3203,
5.9961,
7.9297,
6.2695,
6.4414,
6.7266,
7.1875,
7.3203,
5.4062,
6.0625,
7.0898,
5.3828,
5.6133,
6.0742,
6.6836,
5.7109,
7.2852,
7.7539,
7.5820,
6.4258,
5.9336,
6.3750,
6.3555,
7.5469,
6.2539,
6.5898,
6.4102,
7.0469,
5.7344,
7.2031,
6.7969,
5.6836,
7.6523,
6.9297,
7.8672,
6.4766,
6.3008,
7.0977,
6.5430,
7.0938,
5.8398,
6.9883,
6.5312,
6.3203,
6.3594,
5.4062,
6.9688,
5.7930,
6.3164,
6.5547,
7.1992,
5.8750,
6.3008,
6.7930,
6.0391,
7.4766,
6.6094,
6.5625,
5.9805,
6.2422,
7.2109,
6.6875,
5.3047,
7.6211,
5.9453,
6.5625,
6.1641,
6.1250,
6.5977,
7.7422,
7.0742,
5.6875,
6.2656,
6.6250,
6.8945,
5.7070,
6.3203,
5.7500,
6.2695,
6.2773,
6.8516,
6.4883,
7.0000,
6.7578,
6.1875,
5.9844,
5.5703,
6.7188,
5.5273,
5.3438,
7.2500,
6.7852,
6.5195,
6.8125,
6.0664,
6.7852,
7.0000,
7.0781,
6.8477,
7.2930,
6.3438,
7.1523,
6.3281,
6.8047,
7.3203,
5.3359,
6.1484,
6.5586,
7.3828,
6.2344,
7.1523,
6.4102,
5.5898,
7.0195,
7.1172,
5.8008,
6.5742,
6.2891,
8.0312,
6.9023,
6.5898,
7.1953,
6.7266,
6.0078,
5.5430,
6.4766,
6.4258,
5.9648,
8.0859,
5.0547,
7.2188,
7.4375,
6.5156,
5.9922,
6.3281,
6.2852,
6.7734,
6.2461,
6.9805,
5.4648,
5.8867,
6.8242,
6.3008,
6.3281,
7.3047,
7.1836,
6.5195,
6.6328,
6.7188,
5.4336,
6.5078,
5.3477,
5.5508,
7.3125,
5.8750,
6.5195,
6.2383,
6.3594,
6.0898,
6.4141,
5.9844,
6.6250,
7.7109,
6.0391,
7.2344,
5.9453,
5.9453,
7.0586,
5.6641,
7.2773,
6.5195,
7.2227,
6.3359,
5.3203,
6.4375,
7.2383,
6.4023,
6.2148,
7.3750,
5.8164,
6.2109,
6.5430,
5.8164,
6.1680,
6.7656,
6.0820,
6.1094,
6.5312,
6.8906,
6.8320,
6.1289,
6.3125,
7.6797,
6.3008,
6.0000,
7.3320,
6.7852,
6.9297,
6.6328,
6.2266,
5.1602,
6.2031,
7.0547,
5.9492,
6.0703,
6.0977,
6.8086,
6.0742,
6.0195,
7.0625,
6.5781,
5.7461,
6.1562,
7.0430,
6.7148,
6.5312,
6.5820,
6.4570,
7.5508,
5.6289,
6.0547,
6.5000,
7.3125,
5.8477,
5.9297,
6.2578,
6.0078,
5.9922,
7.3398,
7.4922,
7.8906,
7.5547,
5.4648,
6.5156,
6.3242,
6.1094,
6.9219,
6.7227,
6.6836,
7.4023,
5.9648,
7.2383,
6.7695,
6.6797,
7.0547,
6.3047,
6.4688,
6.9961,
6.0391,
5.9727,
6.8398,
6.7422,
5.7656,
5.4766,
6.7852,
7.0820,
5.3516,
7.6523,
5.1562,
6.6445,
6.1211,
6.2695,
6.0703,
6.3594,
6.4062,
6.3398,
5.7578,
6.5391,
6.2500,
6.5742,
6.5000,
7.5625,
7.0117,
6.5547,
7.1250,
6.4453,
6.6094,
6.1875,
6.4219,
6.6172,
6.4336,
6.5703,
6.1758,
6.4219,
6.6016,
6.7383,
6.7070,
6.1328,
5.5586,
6.6367,
6.3789,
6.2578,
5.5039,
6.6172,
6.4648,
5.8086,
7.2031,
5.8125,
6.3711,
7.6758,
7.1289,
5.8086,
6.3008,
6.2109,
6.1602,
6.1797,
7.2305,
6.7266,
6.2422,
5.6719,
6.7070,
6.9414,
6.8594,
7.4023,
7.2109,
6.0156,
6.6680,
6.6172,
7.1250,
6.6523,
6.9531,
6.7617,
6.4961,
6.9414,
5.7188,
7.6367,
6.5469,
6.2305,
6.4414,
7.4648,
5.9102,
6.2461,
6.1367,
6.8203,
6.5703,
6.8867,
7.0000,
6.7539,
6.1719,
6.5469,
6.2422,
5.4297,
5.7305,
5.1641,
6.1875,
7.0312,
6.6484,
6.0234,
7.4102,
6.8711,
6.3086,
6.3711,
6.7344,
6.6992,
5.9766,
7.3906,
7.1875,
6.4883,
6.3984,
7.3438,
6.9688,
6.9062,
6.4375,
6.7891,
7.0117,
6.4883,
5.7500,
7.0898,
7.0742,
6.7070,
5.8750,
6.0469,
6.6445,
5.2773,
6.8984,
6.1641,
7.0508,
7.4609,
5.0273,
6.7734,
6.4531,
5.7656,
6.5312,
7.4648,
6.1250,
6.5625,
7.1367,
6.0625,
6.1211,
6.9766,
6.6758,
6.3164,
6.8828,
6.8203,
6.7500,
6.5352,
7.3008,
6.7852,
6.1914,
5.0508,
6.7188,
7.1172,
6.8008,
6.8086,
5.4883,
6.9180,
6.5742,
6.1719,
7.0469,
7.1523,
5.9492,
5.8594,
6.8320,
6.1719,
6.2031,
6.8398,
7.3008,
6.6289,
6.4922,
6.0000,
5.4766,
6.3320,
6.5117,
6.2812,
7.5742,
6.3516,
7.0039,
6.4570,
7.1523,
7.6289,
6.2578,
7.1875,
6.4844,
5.7930,
6.7070,
7.5508,
7.1797,
6.0430,
6.8711,
6.5742,
7.5781,
6.4766,
6.5391,
6.9453,
6.1992,
6.6367,
6.2812,
6.0234,
6.6953,
7.0312,
6.2031,
6.5625,
6.6719,
6.1719,
6.5586,
5.7031,
7.4609,
6.6211,
7.7227,
6.9141,
6.0469,
6.2500,
5.3828,
6.0078,
5.8164,
5.8867,
6.1523,
6.6523,
6.6953,
7.3125,
6.4844,
5.9570,
5.9531,
6.2109,
5.5039,
6.5117,
6.8203,
6.6133,
6.4766,
5.9297,
7.1445,
7.1914,
6.0117,
6.8281,
6.7422,
6.1328,
6.9805,
6.5625,
6.9180,
7.1133,
7.3359,
5.7617,
5.8711,
6.4961,
6.5859,
6.2422,
6.5273,
6.7461,
6.6992,
6.7695,
6.6289,
5.9453,
5.9805,
7.1172,
6.6719,
6.0039,
7.6875,
6.7812,
7.8359,
6.9531,
7.4336,
7.6602,
6.8164,
7.3945,
7.1602,
6.8789,
5.0078,
6.0547,
6.8086,
6.7070,
6.4688,
6.4492,
6.6172,
5.5625,
6.6914,
6.4297,
5.7461,
5.3359,
6.8750,
6.4609,
7.4062,
5.2070,
6.0820,
6.7383,
6.5703,
6.1797,
6.7070,
6.5977,
5.9961,
6.6328,
6.9375,
6.3906,
6.6484,
4.9609,
6.6445,
6.5898,
7.1875,
7.5195,
6.7969,
6.1367,
6.8906,
7.4297,
6.3633,
6.0508,
6.5000,
6.4648,
6.7539,
6.7109,
5.8086,
6.6016,
7.1133,
4.8672,
6.6367,
6.1641,
5.1758,
6.9453,
6.3242,
7.0664,
6.4805,
6.3516,
6.7383,
8.4688,
6.7305,
5.9844,
6.5938,
7.2969,
6.5977,
7.5898,
6.2969,
6.8672,
6.6680,
7.1289,
6.6875,
5.4258,
8.1875,
8.0391,
7.7969,
6.6445,
7.0703,
7.3359,
6.9805,
6.6328,
6.5352,
6.2422,
5.5820,
6.8633,
6.8047,
6.5703,
6.0117,
6.7539,
7.1719,
6.8438,
7.3633,
6.6016,
7.2070,
6.4727,
5.8008,
7.4062,
7.4805,
6.6445,
5.9023,
6.3984,
6.9961,
6.6680,
6.8242,
6.7148,
6.6172,
6.9727,
6.8320,
5.9766,
6.6133,
5.5977,
6.7773,
7.3906,
6.9219,
7.0781,
6.6914,
5.7539,
6.7969,
6.8008,
5.8047,
7.1055,
6.4961,
6.0352,
5.6211,
7.4414,
7.0703,
6.1172,
6.7461,
6.4492,
7.7148,
6.4258,
6.0039,
6.5156,
7.2188,
7.4531,
7.4844,
7.5938,
7.4023,
6.7617,
6.0078,
6.3320,
5.8906,
7.5977,
5.6523,
6.7734,
6.3008,
5.2227,
7.1719,
7.1289,
6.6602,
5.4609,
7.0312,
6.0820,
6.1719,
6.0000,
6.5547,
6.6328,
7.0547,
7.0859,
6.2656,
5.5234,
6.0273,
6.7891,
7.1875,
6.9531,
6.8203,
6.3516,
6.1172,
6.4648,
6.9180,
7.3906,
6.2812,
5.7109,
6.1484,
6.9102,
6.8711,
7.0156,
6.1445,
5.8867,
6.3828,
5.9961,
6.6914,
6.7891,
7.0820,
6.6719,
6.9297,
6.3750,
6.7578,
6.4883,
6.2227,
6.2305,
6.0508,
6.6484,
5.7578,
7.2070,
7.2383,
6.9375,
7.2578,
6.5312,
6.0312,
6.7930,
6.2578,
7.0625,
7.2148,
6.4961,
7.0703,
6.4727,
7.3906,
]
).to(torch.float16)
class TestsQ4Exllama(unittest.TestCase):
def test_exllama(self):
group_size = 128
m = 1
k = 1024
n = 1024
device = torch.device("cuda:0")
linear_class = dynamically_import_QuantLinear(
use_triton=False,
desc_act=False,
group_size=group_size,
bits=4,
disable_exllama=False,
disable_exllamav2=True,
)
linear = linear_class(
bits=4,
group_size=group_size,
infeatures=k,
outfeatures=n,
bias=False,
)
self.assertTrue(isinstance(linear, QuantLinear))
torch.manual_seed(42)
linear.qweight = torch.randint(-100, 100, size=linear.qweight.shape, dtype=torch.int32)
linear.scales = linear.scales + 0.002
linear = linear.eval()
linear = linear.to(device)
linear = autogptq_post_init(linear, use_act_order=False)
max_inner_outer_dim = max(k, n)
max_dq_buffer_size = linear.infeatures * linear.outfeatures
max_input_len = 2048
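# prepare_buffers registers the scratch space the exllama (v1) kernel needs:
# temp_state, sized by the maximum input length, and temp_dq, sized by the
# largest weight matrix to dequantize.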
buffers = {
"temp_state": torch.zeros((max_input_len, max_inner_outer_dim), dtype=torch.float16, device=device),
"temp_dq": torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=device),
}
prepare_buffers(device, buffers["temp_state"], buffers["temp_dq"])
# Using the default from exllama repo here.
matmul_recons_thd = 8
matmul_fused_remap = False
matmul_no_half2 = False
set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2)
inp = torch.rand(1, m, k, dtype=torch.float16).to(device)
with torch.no_grad():
res = linear(inp)[0][0]
reference = CUDA_OLD_REFERENCE.to(device)
self.assertTrue(
torch.allclose(res, reference, rtol=3e-5, atol=2e-2),
get_diff(res, reference),
)
def test_exllama_buffer_size(self):
prompt = "I am in Paris and" * 450
device = torch.device("cuda:0")
model_id = "TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g"
revision = "actorder"
model_basename = "vicuna-13B-1.1-GPTQ-4bit-128g.latest"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
model_basename=model_basename,
disable_exllama=False,
disable_exllamav2=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
self.assertTrue(
inp["input_ids"].shape[1] > EXLLAMA_DEFAULT_MAX_INPUT_LENGTH
) # 2048 is the default max_input_length
with self.assertRaises(RuntimeError) as cm:
_ = model_q.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
self.assertTrue("temp_state buffer is too small" in str(cm.exception))
model_q = exllama_set_max_input_length(model_q, 4096)
_ = model_q.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
model_q = exllama_set_max_input_length(model_q, 1034)
with self.assertRaises(RuntimeError) as cm:
_ = model_q.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
self.assertTrue("temp_state buffer is too small" in str(cm.exception))
def test_generation_no_act_order(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")
# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
device="cuda:0",
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
disable_exllama=False,
disable_exllamav2=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
def test_generation_with_act_order(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")
# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and it is a beautiful day. I am sitting in a café, drinking coffee and writing this book. I am surrounded by the sights and sounds of the city, and I am filled with a sense of contentment and gratitude.\n\nI am grateful for the opportunity to live and"
model_id = "TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g"
revision = "actorder"
model_basename = "vicuna-13B-1.1-GPTQ-4bit-128g.latest"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
model_basename=model_basename,
disable_exllama=False,
disable_exllamav2=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
def test_multigpu(self):
# TODO
pass
class TestsQ4CUDA(unittest.TestCase):
REFERENCE_OLD_HALF = torch.Tensor(
[
1.5332,
2.1250,
1.7910,
1.8008,
1.9688,
1.3262,
1.7627,
1.8164,
1.9307,
1.8574,
1.5449,
1.5293,
1.6074,
1.5566,
1.8545,
1.6582,
1.8838,
2.0215,
1.8525,
1.2920,
1.9561,
2.2617,
1.7891,
2.2656,
1.6543,
2.0566,
1.4756,
1.1826,
1.8174,
2.1191,
1.6641,
2.0586,
1.6182,
1.7627,
1.7920,
1.4424,
2.0723,
1.6865,
1.2979,
2.0840,
1.6729,
1.9648,
2.1602,
1.6006,
1.2773,
2.2129,
1.8057,
1.7285,
1.6621,
1.6475,
1.4805,
1.7959,
1.5010,
0.8643,
2.6680,
2.0918,
1.8555,
1.9795,
1.3271,
1.8359,
1.6338,
1.9766,
1.7881,
1.6025,
1.7637,
1.7012,
1.7852,
1.5674,
0.8091,
1.7188,
1.6123,
1.8525,
1.4434,
1.9590,
1.5801,
1.4209,
1.7178,
1.8408,
2.4141,
1.9658,
1.4922,
2.1992,
1.9473,
1.8047,
1.2979,
1.6396,
1.6221,
1.5020,
1.9941,
1.7725,
1.6064,
1.5449,
1.8418,
1.2656,
1.4824,
1.7734,
2.0098,
1.7197,
1.7686,
1.4160,
1.7275,
2.1738,
1.9609,
1.7686,
1.6396,
2.1465,
1.2188,
1.2002,
2.1113,
1.7227,
1.5811,
1.7607,
2.2773,
1.8945,
1.4111,
1.5801,
1.7744,
2.0684,
2.1621,
1.8027,
1.1045,
1.9648,
2.2402,
2.0742,
1.3330,
1.5840,
2.1465,
2.0176,
1.5068,
1.9834,
1.7725,
1.5527,
1.7803,
1.7744,
1.5312,
1.2695,
1.9209,
2.0469,
1.6777,
2.5215,
1.8389,
1.7598,
1.5498,
1.6807,
1.7324,
1.5938,
1.9268,
1.7734,
1.4463,
2.0391,
2.0527,
2.2129,
1.6787,
2.0586,
1.8975,
1.5713,
1.6992,
1.8770,
1.7207,
1.7080,
1.1611,
1.8584,
2.4570,
1.6016,
1.4834,
1.1777,
1.7969,
1.8955,
1.8906,
1.6738,
1.7510,
1.4316,
1.8340,
2.2461,
1.7744,
2.1934,
1.4824,
1.8828,
1.6387,
2.4629,
1.8887,
1.5137,
1.4648,
1.6406,
1.7188,
2.2656,
1.5801,
2.1484,
2.0625,
2.0098,
1.7549,
1.1768,
1.4385,
2.0723,
1.6172,
1.7832,
1.8301,
1.6064,
1.5215,
1.9297,
2.3750,
2.1504,
1.7070,
1.1289,
1.4473,
1.5674,
1.6836,
2.2930,
1.1221,
1.5557,
1.7559,
1.8281,
2.0703,
1.9443,
2.0684,
2.2988,
1.6348,
2.3379,
2.4414,
1.8857,
2.0039,
1.4844,
1.5488,
1.6514,
2.3711,
1.9941,
2.3066,
1.4287,
2.1777,
1.6445,
1.6025,
1.5938,
1.5508,
1.9502,
2.1309,
1.2666,
1.1523,
1.9561,
1.8584,
1.9746,
1.5986,
1.9688,
2.1973,
1.1523,
2.3281,
1.2451,
1.8447,
2.2051,
1.5254,
1.5342,
2.1016,
1.6523,
1.6279,
1.1680,
1.3037,
2.1035,
]
).to(torch.float16)
REFERENCE_OLD_NO_HALF = torch.Tensor(
[
1.5332,
2.1250,
1.7910,
1.7998,
1.9678,
1.3262,
1.7617,
1.8154,
1.9307,
1.8574,
1.5449,
1.5293,
1.6074,
1.5557,
1.8545,
1.6582,
1.8838,
2.0195,
1.8525,
1.2920,
1.9561,
2.2617,
1.7891,
2.2656,
1.6543,
2.0566,
1.4756,
1.1826,
1.8164,
2.1191,
1.6641,
2.0586,
1.6182,
1.7617,
1.7920,
1.4424,
2.0723,
1.6865,
1.2969,
2.0840,
1.6729,
1.9639,
2.1602,
1.5996,
1.2773,
2.2129,
1.8057,
1.7275,
1.6621,
1.6475,
1.4805,
1.7949,
1.5010,
0.8643,
2.6680,
2.0918,
1.8545,
1.9795,
1.3271,
1.8350,
1.6338,
1.9766,
1.7881,
1.6025,
1.7637,
1.7012,
1.7842,
1.5664,
0.8086,
1.7188,
1.6113,
1.8516,
1.4434,
1.9590,
1.5801,
1.4209,
1.7168,
1.8408,
2.4141,
1.9658,
1.4922,
2.1973,
1.9463,
1.8047,
1.2979,
1.6396,
1.6221,
1.5010,
1.9941,
1.7725,
1.6064,
1.5449,
1.8418,
1.2656,
1.4824,
1.7734,
2.0098,
1.7188,
1.7686,
1.4160,
1.7266,
2.1738,
1.9600,
1.7686,
1.6396,
2.1465,
1.2188,
1.2002,
2.1113,
1.7227,
1.5811,
1.7598,
2.2773,
1.8936,
1.4102,
1.5801,
1.7734,
2.0684,
2.1621,
1.8027,
1.1045,
1.9648,
2.2402,
2.0742,
1.3330,
1.5840,
2.1465,
2.0176,
1.5068,
1.9834,
1.7725,
1.5527,
1.7793,
1.7744,
1.5312,
1.2695,
1.9209,
2.0469,
1.6777,
2.5195,
1.8389,
1.7598,
1.5498,
1.6797,
1.7324,
1.5928,
1.9258,
1.7734,
1.4463,
2.0391,
2.0508,
2.2129,
1.6787,
2.0586,
1.8975,
1.5713,
1.6992,
1.8770,
1.7207,
1.7070,
1.1602,
1.8584,
2.4570,
1.6016,
1.4834,
1.1777,
1.7959,
1.8955,
1.8906,
1.6738,
1.7510,
1.4316,
1.8330,
2.2461,
1.7744,
2.1934,
1.4824,
1.8828,
1.6387,
2.4629,
1.8887,
1.5137,
1.4648,
1.6406,
1.7178,
2.2637,
1.5801,
2.1484,
2.0605,
2.0098,
1.7539,
1.1768,
1.4375,
2.0723,
1.6162,
1.7832,
1.8291,
1.6064,
1.5215,
1.9297,
2.3750,
2.1504,
1.7061,
1.1289,
1.4473,
1.5674,
1.6836,
2.2930,
1.1221,
1.5547,
1.7559,
1.8281,
2.0703,
1.9443,
2.0684,
2.2988,
1.6348,
2.3379,
2.4414,
1.8857,
2.0020,
1.4834,
1.5488,
1.6514,
2.3711,
1.9941,
2.3047,
1.4277,
2.1777,
1.6445,
1.6025,
1.5938,
1.5508,
1.9502,
2.1309,
1.2666,
1.1514,
1.9551,
1.8584,
1.9746,
1.5986,
1.9688,
2.1953,
1.1514,
2.3262,
1.2451,
1.8447,
2.2051,
1.5254,
1.5342,
2.1016,
1.6523,
1.6279,
1.1680,
1.3037,
2.1035,
]
).to(torch.float16)
@parameterized.expand([(False,), (True,)])
def test_cuda_old(self, use_half2: bool):
group_size = 128
# test the 256 kernel (in_features % 256 == 0 and out_features % 256 == 0)
m = 1
k = 256
n = 256
device = "cuda"
linear_class = dynamically_import_QuantLinear(
use_triton=False,
desc_act=False,
group_size=group_size,
bits=4,
disable_exllama=True,
disable_exllamav2=True,
)
weight_dtype = torch.float16 if use_half2 else torch.float32
linear = linear_class(
bits=4,
group_size=group_size,
infeatures=k,
outfeatures=n,
bias=False,
weight_dtype=weight_dtype,
)
torch.manual_seed(42)
linear.qweight = torch.randint(-100, 100, size=linear.qweight.shape, dtype=torch.int32)
linear.scales = linear.scales + 0.002
linear.use_cuda_fp16 = use_half2
self.assertTrue(linear.autogptq_cuda_available)
# Create the input in float16 first and then cast to weight_dtype, so that the seeded random values are identical in both cases.
inp = torch.rand(1, m, k, dtype=torch.float16).to(device).to(weight_dtype)
linear = linear.eval()
linear = linear.to(device)
with torch.no_grad():
res = linear(inp)[0][0]
if use_half2:
reference = self.REFERENCE_OLD_HALF.to(device).to(weight_dtype)
else:
reference = self.REFERENCE_OLD_NO_HALF.to(device).to(weight_dtype)
self.assertTrue(torch.allclose(res, reference, rtol=1e-3), get_diff(res, reference))
@parameterized.expand(
[
(torch.float32, "cpu"),
(torch.float32, "cuda:0"),
(torch.float16, "cuda:0"),
]
)
def test_generation_with_act_order(self, torch_dtype, device):
prompt = "I am in Paris and"
# Reference generated with the cuda-old kernel
if device == "cpu":
# CPU implementation is extremely slow.
new_tokens = 2
reference_output = "<s> I am in Paris and it is"
else:
reference_output = "<s> I am in Paris and it is a beautiful day. I am sitting in a café, drinking coffee and writing this book. I am surrounded by the sights and sounds of the city, and I am filled with a sense of contentment and gratitude.\n\nI am grateful for the opportunity to live and"
new_tokens = 60
model_id = "TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g"
revision = "actorder"
model_basename = "vicuna-13B-1.1-GPTQ-4bit-128g.latest"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
revision=revision,
device=device,
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
model_basename=model_basename,
disable_exllama=True,
disable_exllamav2=True,
torch_dtype=torch_dtype,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
# This one uses Autocast.
res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
# This one does not.
res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
@parameterized.expand(
[
(torch.float32, "cpu"),
(torch.float32, "cuda:0"),
(torch.float16, "cuda:0"),
]
)
def test_generation_no_act_order(self, torch_dtype, device):
prompt = "I am in Paris and"
# Reference generated with the cuda-old kernel
if device == "cpu":
# CPU implementation is extremely slow.
new_tokens = 3
reference_output = "<s> I am in Paris and I am going"
else:
reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
new_tokens = 60
model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
device=device,
use_triton=False,
disable_exllama=True,
disable_exllamav2=True,
torch_dtype=torch_dtype,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
# This one uses Autocast.
res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
# This one does not.
res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
class TestsQ4ExllamaV2(unittest.TestCase):
def test_exllamav2(self):
from auto_gptq.nn_modules.qlinear.qlinear_exllamav2 import QuantLinear
group_size = 128
m = 1
k = 1024
n = 1024
device = torch.device("cuda:0")
linear_class = dynamically_import_QuantLinear(use_triton=False, desc_act=False, group_size=group_size, bits=4)
linear = linear_class(
bits=4,
group_size=group_size,
infeatures=k,
outfeatures=n,
bias=False,
)
self.assertTrue(isinstance(linear, QuantLinear))
torch.manual_seed(42)
linear.qweight = torch.randint(-100, 100, size=linear.qweight.shape, dtype=torch.int32)
linear.scales = linear.scales + 0.002
linear = linear.eval()
linear = linear.to(device)
linear = autogptq_post_init(linear, use_act_order=False)
inp = torch.rand(1, m, k, dtype=torch.float16).to(device)
with torch.no_grad():
res = linear(inp)[0][0]
reference = CUDA_OLD_REFERENCE.to(device)
self.assertTrue(
torch.allclose(res, reference, rtol=3e-5, atol=2e-2),
get_diff(res, reference),
)
def test_generation_no_act_order(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")
# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_q = AutoGPTQForCausalLM.from_quantized(model_id, device="cuda:0", use_triton=False)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, do_sample=False, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
def test_generation_with_act_order(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")
# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and it is a beautiful day. I am sitting in a café, drinking coffee and writing this book. I am surrounded by the sights and sounds of the city, and I am filled with a sense of contentment and gratitude.\n\nI am grateful for the opportunity to live and"
model_id = "TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g"
revision = "actorder"
model_basename = "vicuna-13B-1.1-GPTQ-4bit-128g.latest"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
model_basename=model_basename,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
def test_exllama_v2_buffer_size(self):
# prompt = "I'm in Paris and" * 450
prompt = "I'm in Paris and" * 500
device = torch.device("cuda:0")
model_id = "TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g"
revision = "actorder"
model_basename = "vicuna-13B-1.1-GPTQ-4bit-128g.latest"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
model_basename=model_basename,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
self.assertTrue(inp["input_ids"].shape[1] > 2048) # 2048 is the default max_input_length for Llama
_ = model_q.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
class TestsMixtral(unittest.TestCase):
def test_mixtral_generation(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")
# Reference generated with the cuda-old kernel
reference_output = """<s> I am in Paris andpublishedющиеcs performancesension manual offset亡VIDEO Kel RepubliczwDrawlichen LondresPSungspfn CreahooEESlider laughselvesлександTrytpl recallслу Ор coldsubset########serdeacion providestrm thoughts président oktobermulticol../редβ themselvesterraряд conflictscommandMass diagonal選 ptrTY還 Havepliedument relate redu"""
model_id = "TheBlokeAI/Mixtral-tiny-GPTQ"
model_basename = "model"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
use_safetensors=True,
device="cuda:0",
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
model_basename=model_basename,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60, do_sample=False)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
class TestQ4Marlin(unittest.TestCase):
def test_generation(self):
# Reference generated with the cuda-old kernel and TheBloke/Llama-2-7B-Chat-GPTQ
reference_output = "<s> I am in Paris and I am feeling very sad and lonely. everybody I know is busy and I don't have any friends here. I am staying in a small apartment in the 11th arrondissement and I am feeling very isolated. I miss my friends and family back home and I don'"
prompt = "I am in Paris and"
device = torch.device("cuda:0")
model_id = "TheBloke/Llama-2-7B-Chat-GPTQ"
try:
model_q = AutoGPTQForCausalLM.from_quantized(model_id, device="cuda:0", use_marlin=True)
except ValueError as e:
if torch.version.hip:
self.assertTrue("Can not use Marlin int4*fp16 kernel with AMD ROCm" in str(e))
self.skipTest("Can not run this test on ROCm")
else:
raise e
has_marlin = False
for _, module in model_q.named_modules():
if isinstance(module, MarlinQuantLinear):
has_marlin = True
break
self.assertTrue(has_marlin)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
def test_bias(self):
# TheBloke/Llama-2-7B-Chat-GPTQ has bias tensors, but they are all zeros, so use a checkpoint that actually exercises bias.
model_id = "s3nh/starcoderbase-1b-GPTQ"
try:
model_q = AutoGPTQForCausalLM.from_quantized(model_id, device="cuda:0", use_marlin=True)
except ValueError as e:
if torch.version.hip:
self.assertTrue("Can not use Marlin int4*fp16 kernel with AMD ROCm" in str(e))
self.skipTest("Can not run this test on ROCm")
else:
raise e
for _, param in model_q.named_parameters():
self.assertTrue(param.device != torch.device("meta"))
for _, param in model_q.named_buffers():
self.assertTrue(param.device != torch.device("meta"))
self.assertTrue(torch.count_nonzero(model_q.model.transformer.h[0].attn.c_proj.bias) > 0)
self.assertTrue(torch.count_nonzero(model_q.model.transformer.h[0].attn.c_attn.bias) > 0)
tokenizer = AutoTokenizer.from_pretrained("Xenova/starcoderbase-1b")
prompt = "Today I am in Paris and"
inp = tokenizer(prompt, return_tensors="pt").to("cuda:0")
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertTrue(predicted_text.startswith("Today I am in Paris and I am a student of the Master's"))
class TestsQ4Triton(unittest.TestCase):
def test_generation_no_act_order(self):
prompt = "I am in Paris and"
reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
new_tokens = 60
model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
device="cuda:0",
use_triton=False,
disable_exllama=True,
disable_exllamav2=True,
torch_dtype=torch.float16,
use_tritonv2=True,
)
for _, submodule in model_q.named_modules():
if isinstance(submodule, TritonV2QuantLinear):
break
else:
raise ValueError("Did not find a tritonv2 linear layer")
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to("cuda:0")
# This one uses Autocast.
res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
# This one does not.
res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
def test_generation_with_act_order(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")
# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and it is a beautiful day. I am sitting in a café, drinking coffee and writing this book. I am surrounded by the sights and sounds of the city, and I am filled with a sense of contentment and gratitude.\n\nI am grateful for the opportunity to live and"
model_id = "TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g"
revision = "actorder"
model_basename = "vicuna-13B-1.1-GPTQ-4bit-128g.latest"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
model_basename=model_basename,
disable_exllama=True,
disable_exllamav2=True,
use_tritonv2=True,
)
for _, submodule in model_q.named_modules():
if isinstance(submodule, TritonV2QuantLinear):
break
else:
raise ValueError("Did not find a tritonv2 linear layer")
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
class TestQ4HPU(unittest.TestCase):
@parameterized.expand(
[
("hpu", torch.bfloat16),
("hpu", torch.float),
]
)
def test_generation(self, in_device, model_dtype):
try:
import habana_frameworks.torch.core as htcore
except Exception as e:
self.skipTest("Couldn't import HPU plugin, skipping HPU tests")
# Reference generated with the cuda-old kernel and TheBloke/Llama-2-7B-Chat-GPTQ
reference_output = "<s> I am in Paris and I am feeling very sad and lonely. everybody I know is busy and I don't have any friends here. I am staying in a small apartment in the 11th arrondissement and I am feeling very isolated. I miss my friends and family back home and I don'"
prompt = "I am in Paris and"
device = torch.device(in_device)
model_id = "TheBloke/Llama-2-7B-Chat-GPTQ"
try:
from transformers import GPTQConfig, AutoModelForCausalLM
quantization_config = GPTQConfig(bits=4, use_exllama=False)
model_kwargs = {
"revision": "main",
"token": None
}
model_q = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=model_dtype, quantization_config=quantization_config, **model_kwargs)
model_q = model_q.eval().to(device)
except ValueError as e:
if torch.version.hip:
self.assertTrue("Can not use HPU int4 kernel" in str(e))
self.skipTest("Can not run this test on HPU")
else:
raise e
tokenizer_kwargs = {
"revision": "main",
"token": None
}
tokenizer = AutoTokenizer.from_pretrained(model_id, **tokenizer_kwargs)
if not model_q.config.is_encoder_decoder:
tokenizer.padding_side = "left"
# Some models like GPT2 do not have a PAD token so we have to set it if necessary
if model_q.config.model_type == "llama":
# unwind broken decapoda-research config
model_q.generation_config.pad_token_id = 0
model_q.generation_config.bos_token_id = 1
model_q.generation_config.eos_token_id = 2
tokenizer.bos_token_id = model_q.generation_config.bos_token_id
tokenizer.eos_token_id = model_q.generation_config.eos_token_id
tokenizer.pad_token_id = model_q.generation_config.pad_token_id
tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id)
tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id)
tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model_q.generation_config.pad_token_id = model_q.generation_config.eos_token_id
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
@parameterized.expand(
[
("hpu", torch.bfloat16),
("hpu", torch.float),
]
)
def test_bias(self, in_device, model_dtype):
try:
import habana_frameworks.torch.core as htcore
except Exception as e:
self.skipTest("Couldn't import HPU plugin, skipping HPU tests")
device = torch.device(in_device)
# TheBloke/Llama-2-7B-Chat-GPTQ has bias tensors, but they are all zeros, so use a checkpoint that actually exercises bias.
model_id = "s3nh/starcoderbase-1b-GPTQ"
try:
model_kwargs = {
"revision": "main",
"token": None
}
model_q = AutoGPTQForCausalLM.from_quantized(model_id, torch_dtype=model_dtype, use_marlin=False, **model_kwargs)
model_q = model_q.eval().to(device)
except ValueError as e:
if torch.version.hip:
self.assertTrue("Can not use HPU int4 kernel" in str(e))
self.skipTest("Can not run this test on HPU")
else:
raise e
for _, param in model_q.named_parameters():
self.assertTrue(param.device != torch.device("meta"))
for _, param in model_q.named_buffers():
self.assertTrue(param.device != torch.device("meta"))
self.assertTrue(torch.count_nonzero(model_q.model.transformer.h[0].attn.c_proj.bias) > 0)
self.assertTrue(torch.count_nonzero(model_q.model.transformer.h[0].attn.c_attn.bias) > 0)
tokenizer_kwargs = {
"revision": "main",
"token": None
}
tokenizer = AutoTokenizer.from_pretrained("Xenova/starcoderbase-1b", **tokenizer_kwargs)
if not model_q.config.is_encoder_decoder:
tokenizer.padding_side = "left"
# Some models like GPT2 do not have a PAD token so we have to set it if necessary
if model_q.config.model_type == "llama":
# unwind broken decapoda-research config
model_q.generation_config.pad_token_id = 0
model_q.generation_config.bos_token_id = 1
model_q.generation_config.eos_token_id = 2
tokenizer.bos_token_id = model_q.generation_config.bos_token_id
tokenizer.eos_token_id = model_q.generation_config.eos_token_id
tokenizer.pad_token_id = model_q.generation_config.pad_token_id
tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id)
tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id)
tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model_q.generation_config.pad_token_id = model_q.generation_config.eos_token_id
prompt = "Today I am in Paris and"
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertTrue(predicted_text.startswith("Today I am in Paris and I am a student of the Master's"))
import os
import tempfile
import unittest
import torch.cuda
from parameterized import parameterized
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.quantization import CHECKPOINT_FORMAT, QUANT_CONFIG_FILENAME, BaseQuantizeConfig
class TestQuantization(unittest.TestCase):
@parameterized.expand([(False,), (True,)])
def test_quantize(self, use_marlin: bool):
pretrained_model_dir = "saibo/llama-1B"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = [
tokenizer(
"auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
),
tokenizer(
"Today I am in Paris and it is a wonderful day."
),
]
quantize_config = BaseQuantizeConfig(
bits=4,
group_size=128,
desc_act=False,
checkpoint_format=CHECKPOINT_FORMAT.MARLIN if use_marlin else CHECKPOINT_FORMAT.GPTQ,
)
model = AutoGPTQForCausalLM.from_pretrained(
pretrained_model_dir,
quantize_config=quantize_config,
use_flash_attention_2=False,
)
model.quantize(examples)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = AutoGPTQForCausalLM.from_quantized(tmpdirname, device="cuda:0", use_marlin=use_marlin)
del model
torch.cuda.empty_cache()
# test compat: 1) with simple dict type 2) is_marlin_format
compat_quantize_config = {
"bits": 4,
"group_size": 128,
"desc_act": False,
"is_marlin_format": use_marlin,
}
model = AutoGPTQForCausalLM.from_quantized(tmpdirname, device="cuda:0", quantize_config=compat_quantize_config)
self.assertIsInstance(model.quantize_config, BaseQuantizeConfig)
del model
torch.cuda.empty_cache()
# test checkpoint_format hint to from_quantized()
os.remove(f"{tmpdirname}/{QUANT_CONFIG_FILENAME}")
compat_quantize_config = {
"bits": 4,
"group_size": 128,
"desc_act": False,
}
model = AutoGPTQForCausalLM.from_quantized(tmpdirname, device="cuda:0",
quantize_config=compat_quantize_config,
checkpoint_format=CHECKPOINT_FORMAT.MARLIN if use_marlin else None)
self.assertIsInstance(model.quantize_config, BaseQuantizeConfig)
import copy
import unittest
import autogptq_marlin_cuda
import torch
import torch.nn as nn
from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear as CudaOldQuantLinear
from auto_gptq.nn_modules.qlinear.qlinear_marlin import QuantLinear as MarlinQuantLinear
from auto_gptq.nn_modules.qlinear.qlinear_marlin import _get_perms, dequantize_weight
def gen_quant4(k, n, groupsize=-1):
maxq = 2 ** 4 - 1
w = torch.randn((k, n), dtype=torch.half, device="cpu")
original_w = w.clone()
if groupsize != -1:
w = w.reshape((-1, groupsize, n))
w = w.permute(1, 0, 2)
w = w.reshape((groupsize, -1))
s = torch.max(torch.abs(w), 0, keepdim=True)[0]
s *= 2 / maxq
# Quantize.
w = torch.round(w / s).int()
# Unsigned storage.
w += (maxq + 1) // 2
w = torch.clamp(w, 0, maxq)
# Dequantize.
ref = (w - (maxq + 1) // 2).half() * s
if groupsize != -1:
def reshape(w):
w = w.reshape((groupsize, -1, n))
w = w.permute(1, 0, 2)
w = w.reshape((k, n)).contiguous()
return w
ref = reshape(ref)
w = reshape(w)
s = s.reshape((-1, n)).contiguous()
linear = nn.Linear(k, n, bias=False)
linear.weight.data = ref.t()
return original_w, linear, s
original_w, linear, s = gen_quant4(64, 128)
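# Hedged sketch (not part of the deleted test file): a worked example of the
# symmetric 4-bit scheme in gen_quant4() above. With maxq = 15 and a per-column
# scale s = 2 * max(|w|) / maxq, a weight w = 0.30 at scale s = 0.08 maps to
# round(0.30 / 0.08) + 8 = 12 in unsigned storage and dequantizes back to
# (12 - 8) * 0.08 = 0.32, i.e. within s / 2 of the original value.
# The module-level call above makes the same point numerically: the maximum
# round-trip error stays on the order of half the largest per-column scale.
print("gen_quant4 max abs round-trip error:", (original_w - linear.weight.data.t()).abs().max().item())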
class TestRepacking(unittest.TestCase):
def test_marlin_fast_repacking(self):
k = 2048
n = 1024
m = 5
group_size = 128
_, linear, s = gen_quant4(k, n, group_size)
cuda_old_linear = CudaOldQuantLinear(bits=4, group_size=group_size, infeatures=k, outfeatures=n, bias=False)
zeros = torch.full((k // group_size, n), 8, dtype=torch.int32)
cuda_old_linear.pack(linear, s.T, zeros.T, g_idx=None)
# Adapted from utils.marlin_utils.convert_to_marlin
dequantized_weight, dequantized_qzeros = dequantize_weight(cuda_old_linear)
dequantized_weight = dequantized_weight.to(torch.float16)
self.assertTrue(torch.all(dequantized_qzeros == 8))
linear_module = torch.nn.Linear(
in_features=k,
out_features=n,
bias=False,
dtype=torch.float16,
device="cuda",
)
linear_module.weight.data.copy_(linear.weight.data)  # Use the exact reference weights rather than dequantized_weight to avoid approximation error
# Create new linear method and copy to model.
marlin_linear = MarlinQuantLinear(
bits=4,
group_size=group_size,
infeatures=k,
outfeatures=n,
bias=False,
trainable=False,
)
marlin_linear.pack(linear_module.to("cuda"), scales=copy.deepcopy(cuda_old_linear.scales.data.t()).to("cuda"))
inp = torch.rand(m, k, dtype=torch.float16, device="cuda")
cuda_old_linear = cuda_old_linear.to("cuda")
marlin_linear = marlin_linear.to("cuda")
with torch.no_grad():
res_cuda_old = cuda_old_linear(inp)
res_marlin = marlin_linear(inp)
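# Compare the two kernels via mean relative error; the 1e-12 term guards against division by zero for near-zero reference outputs.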
reldiff = (res_cuda_old - res_marlin).abs() / (res_cuda_old.abs() + 1e-12)
self.assertTrue(torch.mean(reldiff) < 4e-3)
weight_repacked = autogptq_marlin_cuda.gptq_repack(cuda_old_linear.qweight)
self.assertTrue(torch.allclose(weight_repacked, marlin_linear.B))
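# Re-derive Marlin's scale permutation by hand and check that it matches the scales stored by the packed Marlin layer.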
_, _scale_perm, _scale_perm_single = _get_perms()
s = cuda_old_linear.scales.data.clone()
if group_size != k:
s = s.reshape((1, -1))
s = s.reshape((-1, len(_scale_perm)))[:, _scale_perm]
else:
s = s.reshape((-1, len(_scale_perm_single)))[:, _scale_perm_single]
s = s.reshape((-1, n)).contiguous()
self.assertTrue(torch.allclose(s, marlin_linear.s))
import json
import os
import tempfile
import time
import unittest
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.quantization import CHECKPOINT_FORMAT, CHECKPOINT_FORMAT_FIELD, QUANT_CONFIG_FILENAME
from auto_gptq.quantization.config import QUANT_METHOD, BaseQuantizeConfig
class TestSerialization(unittest.TestCase):
MODEL_ID = "habanoz/TinyLlama-1.1B-Chat-v0.3-GPTQ"
def setUp(self):
dummy_config = BaseQuantizeConfig(
model_name_or_path=self.MODEL_ID,
quant_method=QUANT_METHOD.GPTQ,
checkpoint_format=CHECKPOINT_FORMAT.MARLIN)
model_cache_path, is_cached = dummy_config.get_cache_file_path()
if is_cached:
os.remove(model_cache_path)
def test_marlin_local_serialization(self):
start = time.time()
model = AutoGPTQForCausalLM.from_quantized(self.MODEL_ID, device="cuda:0", use_marlin=True)
end = time.time()
first_load_time = end - start
with tempfile.TemporaryDirectory() as tmpdir:
model.save_pretrained(tmpdir)
self.assertTrue(os.path.isfile(os.path.join(tmpdir, "model.safetensors")))
model_cache_path, is_cached = model.quantize_config.get_cache_file_path()
self.assertFalse(os.path.isfile(os.path.join(tmpdir, model_cache_path)))
with open(os.path.join(tmpdir, QUANT_CONFIG_FILENAME), "r") as config_file:
config = json.load(config_file)
self.assertTrue(config[CHECKPOINT_FORMAT_FIELD] == CHECKPOINT_FORMAT.MARLIN)
start = time.time()
model = AutoGPTQForCausalLM.from_quantized(tmpdir, device="cuda:0", use_marlin=True)
end = time.time()
second_load_time = end - start
# Since we use a CUDA kernel to repack weights, the first load time is already small.
self.assertTrue(second_load_time < first_load_time)
def test_marlin_hf_cache_serialization(self):
start = time.time()
model = AutoGPTQForCausalLM.from_quantized(self.MODEL_ID, device="cuda:0", use_marlin=True)
self.assertTrue(model.quantize_config.checkpoint_format == CHECKPOINT_FORMAT.MARLIN)
end = time.time()
first_load_time = end - start
model_cache_path, is_cached = model.quantize_config.get_cache_file_path()
self.assertTrue("assets" in model_cache_path)
self.assertTrue(is_cached)
start = time.time()
model = AutoGPTQForCausalLM.from_quantized(self.MODEL_ID, device="cuda:0", use_marlin=True)
self.assertTrue(model.quantize_config.checkpoint_format == CHECKPOINT_FORMAT.MARLIN)
end = time.time()
second_load_time = end - start
# Since we use a CUDA kernel to repack weights, the first load time is already small.
self.assertTrue(second_load_time < first_load_time)
import unittest
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
class TestShardedLoading(unittest.TestCase):
def test_loading(self):
model_name = "TheBlokeAI/llama-68m-GPTQ-sharded"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(model_name, device='cuda:0')
tokens = model.generate(**tokenizer("1337", return_tensors="pt").to(model.device), max_new_tokens=20)[0]
result = tokenizer.decode(tokens)
self.assertTrue(result == '<s> 133777777777777777777777')
def test_loading_large(self):
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat-GPTQ-Int4")
model = AutoGPTQForCausalLM.from_quantized("Qwen/Qwen1.5-7B-Chat-GPTQ-Int4", device='cuda:0')
tokens = model.generate(**tokenizer("Today I am in Paris and", return_tensors="pt").to(model.device), max_new_tokens=20)[0]
result = tokenizer.decode(tokens)
self.assertTrue(result == 'Today I am in Paris and I am going to the Louvre Museum. I want to see the Mona Lisa painting, but I')
import os
import unittest
import torch
import torch.utils.benchmark as benchmark
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
MODEL_ID = "TheBloke/Llama-7B-GPTQ"
DATASET_ID = "timdettmers/openassistant-guanaco"
LEARNING_RATE = 3e-5
MAX_SEQ_LEN = 10
BATCH_SIZE = 5
NUM_TRAIN_STEPS = 10
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def benchmark_forward(
fn,
*inputs,
repeats="auto",
desc="",
verbose=True,
amp=False,
amp_dtype=torch.float16,
**kwinputs,
):
if verbose:
print(desc, "- Forward pass")
def amp_wrapper(*inputs, **kwinputs):
with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
fn(*inputs, **kwinputs)
t = benchmark.Timer(
stmt="fn_amp(*inputs, **kwinputs)",
globals={"fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs},
num_threads=torch.get_num_threads(),
)
if repeats == "auto":
m = t.blocked_autorange()
else:
m = t.timeit(repeats)
if verbose:
print(m)
return t, m
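# Hedged usage sketch (not part of the deleted test file): benchmark_forward()
# can also time a plain fp16 nn.Linear as a baseline before comparing the
# quantized Triton kernels below; shapes here are illustrative only.
#
#   import torch.nn as nn
#   layer = nn.Linear(4096, 4096, bias=False, device="cuda", dtype=torch.float16)
#   x = torch.randn(1, 2048, 4096, device="cuda", dtype=torch.float16)
#   _, measurement = benchmark_forward(layer, x, desc="fp16 baseline", verbose=True)
#   print(f"fp16 baseline mean latency: {measurement.mean * 1e3:.3f} ms")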
def get_model_and_tokenizer(
model_id=MODEL_ID,
inject_fused_attention=False,
inject_fused_mlp=False,
**model_kwargs,
):
tokenizer = AutoTokenizer.from_pretrained(
MODEL_ID,
use_fast=True,
)
if tokenizer.pad_token_id is None:
tokenizer.pad_token_id = tokenizer.eos_token_id
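# Exllama kernels are disabled here so that the Triton-based QuantLinear requested through model_kwargs (use_triton / use_tritonv2) is the one being benchmarked.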
model = AutoGPTQForCausalLM.from_quantized(
model_id,
trainable=True,
inject_fused_attention=inject_fused_attention,
inject_fused_mlp=inject_fused_mlp,
disable_exllamav2=True,
disable_exllama=True,
**model_kwargs,
)
model.warmup_triton()
return model, tokenizer
class TestTriton(unittest.TestCase):
def test_triton_qlinear(self):
ref_model, _ = get_model_and_tokenizer(
model_id=MODEL_ID,
use_triton=True,
inject_fused_attention=False,
inject_fused_mlp=False,
)
test_model, _ = get_model_and_tokenizer(
model_id=MODEL_ID,
use_tritonv2=True,
inject_fused_attention=False,
inject_fused_mlp=False,
)
hidden_size = ref_model.model.model.embed_tokens.weight.shape[1]
test_data = torch.randn((1, 2048, hidden_size), dtype=torch.float16).cuda()
qlinear_ref = ref_model.model.model.layers[0].self_attn.q_proj
qlinear_test = test_model.model.model.layers[0].self_attn.q_proj
test_out = qlinear_test(test_data)
ref_out = qlinear_ref(test_data)
self.assertTrue(torch.allclose(test_out, ref_out))
_, measure_triton = benchmark_forward(qlinear_ref, test_data, desc="Triton", verbose=True)
_, measure_tritonv2 = benchmark_forward(qlinear_test, test_data, desc="Triton-v2", verbose=True)
self.assertTrue(measure_tritonv2.mean < measure_triton.mean)