Commit f8070792 authored by yangql

Deleted tests/__init__.py, tests/bench_autoawq_autogptq.py, tests/pytest.ini, tests/test_awq_compatibility_generation.py, tests/test_hpu_linear.py, tests/test_peft_conversion.py, tests/test_q4.py, tests/test_quantization.py, tests/test_repacking.py, tests/test_serialization.py, tests/test_sharded_loading.py, tests/test_triton.py files
parent a2630e0f
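
# =========================================================================
# tests/bench_autoawq_autogptq.py
# Benchmarks a single 4-bit quantized linear layer (shaped like the Yi-34B
# down_proj, k=20480, n=7168) with AutoGPTQ's ExLlamaV2 kernel against
# AutoAWQ's GEMV/GEMM kernels over a sweep of sequence lengths, and prints
# the results as CSV. Run directly, e.g. `python tests/bench_autoawq_autogptq.py`,
# on a machine with a CUDA GPU and both auto-gptq and autoawq installed.
# =========================================================================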
import torch
try:
from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV
except ModuleNotFoundError as e:
raise ModuleNotFoundError(
f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this benchmark. {e}"
)
import numpy as np
from auto_gptq.modeling._utils import autogptq_post_init
from auto_gptq.nn_modules.qlinear.qlinear_exllamav2 import QuantLinear
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
group_size = 128
bits = 4
# Yi 34B down_proj
k = 20480
n = 7168
device = torch.device("cuda:0")
linear_class = dynamically_import_QuantLinear(use_triton=False, desc_act=False, group_size=group_size, bits=4)
linear_gptq = linear_class(
bits=bits,
group_size=group_size,
infeatures=k,
outfeatures=n,
bias=False,
)
assert isinstance(linear_gptq, QuantLinear)
linear_gptq = linear_gptq.eval()
linear_gptq = linear_gptq.to(device)
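# autogptq_post_init finalizes kernel setup (e.g. allocating the exllama/exllamav2
# scratch buffers) once the layer sits on its target device.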
linear_gptq = autogptq_post_init(linear_gptq, use_act_order=False)
num_runs = 60
lines = []
seqlens = [
1,
2,
3,
4,
5,
6,
7,
8,
12,
16,
24,
32,
48,
64,
80,
120,
250,
512,
1024,
2048,
4000,
8000,
]
print(f"in_features={k}, out_features={n}")
for query_length in seqlens:
# batch_size, query_length, hidden_size
inp = torch.rand(1, query_length, k, dtype=torch.float16).to(device)
torch.cuda.empty_cache()
# Warmup Exllama v2
with torch.no_grad():
res = linear_gptq(inp)
latencies = []
torch.cuda.synchronize()
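# Time each forward pass with CUDA events; elapsed_time() reports milliseconds,
# and the surrounding synchronize() calls ensure the kernels have actually finished.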
for _ in range(num_runs):
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
start_event.record()
res = linear_gptq(inp)
end_event.record()
torch.cuda.synchronize()
latency_ms = start_event.elapsed_time(end_event)
latencies.append(latency_ms)
# print("-------")
# print(f"Latency GPTQ Exllama v2 (query_length={query_length}): {np.mean(latencies):.3f} ms, p10={np.percentile(latencies, 10):.3f}, p90={np.percentile(latencies, 90):.3f}")
exllamav2_mean_latency = np.mean(latencies)
exllamav2_p10 = np.percentile(latencies, 10)
exllamav2_p90 = np.percentile(latencies, 90)
torch.cuda.empty_cache()
total_seqlen = inp.shape[:-1].numel()
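# AutoAWQ ships two kernels: WQLinear_GEMV, tuned for very short (decode-style)
# inputs, and WQLinear_GEMM for longer (prefill-style) inputs; mirror that
# choice here based on the total number of tokens.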
if total_seqlen <= 8:
awq_kernel = "GEMV"
linear_awq = WQLinear_GEMV(
w_bit=bits,
group_size=group_size,
in_features=k,
out_features=n,
bias=False,
dev=device,
)
else:
awq_kernel = "GEMM"
linear_awq = WQLinear_GEMM(
w_bit=bits,
group_size=group_size,
in_features=k,
out_features=n,
bias=False,
dev=device,
)
# Warmup AWQ
with torch.no_grad():
res = linear_awq(inp)
latencies = []
torch.cuda.synchronize()
for _ in range(num_runs):
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
start_event.record()
res = linear_awq(inp)
end_event.record()
torch.cuda.synchronize()
latency_ms = start_event.elapsed_time(end_event)
latencies.append(latency_ms)
awq_mean_latency = np.mean(latencies)
awq_p10 = np.percentile(latencies, 10)
awq_p90 = np.percentile(latencies, 90)
exllama_speedup = awq_mean_latency / exllamav2_mean_latency
# print(f"Latency AWQ (query_length={query_length}, kernel={awq_kernel}): {np.mean(latencies):.3f} ms, p10={np.percentile(latencies, 10):.3f}, p90={np.percentile(latencies, 90):.3f}")
line = "{},{},{},{},{},{},{},{},{},{},{}".format(
bits,
group_size,
total_seqlen,
awq_kernel,
f"{awq_mean_latency:.3f}",
f"{exllamav2_mean_latency:.3f}",
f"{awq_p10:.3f}",
f"{awq_p90:.3f}",
f"{exllamav2_p10:.3f}",
f"{exllamav2_p90:.3f}",
f"{exllama_speedup:.3f}",
)
lines.append(line)
header = "bits, group_size, total_seqlen, awq_kernel, awq_mean_latency (ms), exllamav2_mean_latency (ms), awq_p10, awq_p90, exllamav2_p10, exllamav2_p90, exllama_speedup"
print(header)
for line in lines:
print(line)
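
# =========================================================================
# tests/pytest.ini
# =========================================================================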
[pytest]
addopts=-s -v
log_cli=true
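
# =========================================================================
# tests/test_awq_compatibility_generation.py
# Loads TheBloke/Llama-2-7B-Chat-AWQ through AutoGPTQ's various backends
# (PyTorch fallback, cuda 64/256, exllama, exllamav2) and checks that the
# generated text matches AutoAWQ's own output.
# =========================================================================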
# ruff: noqa: I001
import unittest
import torch
import autogptq_cuda_64
import autogptq_cuda_256
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear as CudaOldQLinear
try:
from awq import AutoAWQForCausalLM
except ModuleNotFoundError as e:
AutoAWQForCausalLM = None
AWQ_EXCEPTION = e
class TestAwqCompatibility(unittest.TestCase):
# TODO: test cuda-old fp16.
# TODO: test cuda-old fp32.
# TODO: test exllama v2.
def test_generation_cuda_old_fp32_pytorch(self):
if AutoAWQForCausalLM is None:
self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")
device = torch.device("cuda:0")
quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"
model_autogptq = AutoGPTQForCausalLM.from_quantized(
quant_path,
device=device,
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
disable_exllama=True,
disable_exllamav2=True,
torch_dtype=torch.float32,
)
tokenizer = AutoTokenizer.from_pretrained(quant_path)
prompt = "I am in Paris and I am going to see the"
inp = tokenizer(prompt, return_tensors="pt").to(device)
for name, submodule in model_autogptq.named_modules():
if isinstance(submodule, CudaOldQLinear):
# Just a hack to test the handmade pytorch implementation path.
submodule.autogptq_cuda_available = False
autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
autogptq_output = tokenizer.decode(autogptq_output[0])
model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
awq_output = model_awq.generate(
**inp,
num_beams=1,
min_new_tokens=30,
max_new_tokens=30,
)
awq_output = tokenizer.decode(awq_output[0])
self.assertTrue(awq_output == autogptq_output)
def test_generation_cuda_old_cuda_256(self):
if AutoAWQForCausalLM is None:
self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")
device = torch.device("cuda:0")
quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"
tokenizer = AutoTokenizer.from_pretrained(quant_path)
prompt = "I am in Paris and I am going to see the"
for torch_dtype in [torch.float16, torch.float32]:
model_autogptq = AutoGPTQForCausalLM.from_quantized(
quant_path,
device=device,
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
disable_exllama=True,
disable_exllamav2=True,
torch_dtype=torch_dtype,
)
for name, module in model_autogptq.named_modules():
if isinstance(module, CudaOldQLinear):
self.assertTrue(module.autogptq_cuda == autogptq_cuda_256)
if torch_dtype == torch.float32:
self.assertFalse(module.use_cuda_fp16)
else:
self.assertTrue(module.use_cuda_fp16)
inp = tokenizer(prompt, return_tensors="pt").to(device)
autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
autogptq_output = tokenizer.decode(autogptq_output[0])
model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
awq_output = model_awq.generate(
**inp,
num_beams=1,
min_new_tokens=30,
max_new_tokens=30,
)
awq_output = tokenizer.decode(awq_output[0])
self.assertTrue(awq_output == autogptq_output)
def test_generation_cuda_old_cuda_64(self):
if AutoAWQForCausalLM is None:
self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")
device = torch.device("cuda:0")
quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"
tokenizer = AutoTokenizer.from_pretrained(quant_path)
prompt = "I am in Paris and I am going to see the"
for torch_dtype in [torch.float16, torch.float32]:
model_autogptq = AutoGPTQForCausalLM.from_quantized(
quant_path,
device=device,
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
disable_exllama=True,
disable_exllamav2=True,
torch_dtype=torch_dtype,
)
# Force autogptq_cuda_64.
for name, module in model_autogptq.named_modules():
if isinstance(module, CudaOldQLinear):
if module.autogptq_cuda != autogptq_cuda_64:
module.autogptq_cuda = autogptq_cuda_64
if torch_dtype == torch.float32:
self.assertFalse(module.use_cuda_fp16)
else:
self.assertTrue(module.use_cuda_fp16)
inp = tokenizer(prompt, return_tensors="pt").to(device)
autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
autogptq_output = tokenizer.decode(autogptq_output[0])
model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
awq_output = model_awq.generate(
**inp,
num_beams=1,
min_new_tokens=30,
max_new_tokens=30,
)
awq_output = tokenizer.decode(awq_output[0])
self.assertTrue(awq_output == autogptq_output)
def test_generation_exllama(self):
if AutoAWQForCausalLM is None:
self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")
device = torch.device("cuda:0")
quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"
model_autogptq = AutoGPTQForCausalLM.from_quantized(
quant_path,
device=device,
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
disable_exllama=False,
disable_exllamav2=True,
torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(quant_path)
prompt = "I am in Paris and I am going to see the"
inp = tokenizer(prompt, return_tensors="pt").to(device)
for name, submodule in model_autogptq.named_modules():
if isinstance(submodule, CudaOldQLinear):
# Just a hack to test the handmade pytorch implementation path.
submodule.autogptq_cuda_available = False
autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
autogptq_output = tokenizer.decode(autogptq_output[0])
model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
awq_output = model_awq.generate(
**inp,
num_beams=1,
min_new_tokens=30,
max_new_tokens=30,
)
awq_output = tokenizer.decode(awq_output[0])
self.assertTrue(awq_output == autogptq_output)
def test_generation_exllamav2(self):
if AutoAWQForCausalLM is None:
self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")
device = torch.device("cuda:0")
quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"
model_autogptq = AutoGPTQForCausalLM.from_quantized(
quant_path,
device=device,
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(quant_path)
prompt = "I am in Paris and I am going to see the"
inp = tokenizer(prompt, return_tensors="pt").to(device)
for name, submodule in model_autogptq.named_modules():
if isinstance(submodule, CudaOldQLinear):
# Just a hack to test the handmade pytorch implementation path.
submodule.autogptq_cuda_available = False
autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
autogptq_output = tokenizer.decode(autogptq_output[0])
model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
awq_output = model_awq.generate(
**inp,
num_beams=1,
min_new_tokens=30,
max_new_tokens=30,
)
awq_output = tokenizer.decode(awq_output[0])
self.assertTrue(awq_output == autogptq_output)
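
# =========================================================================
# tests/test_hpu_linear.py
# Compares the Habana HPU QuantLinear against the cuda-old QuantLinear
# reference over a grid of shapes, group sizes, dtypes and synthetic
# scale/weight/zero-point patterns.
# =========================================================================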
import numpy as np
import math
import torch
import pytest
try:
import habana_frameworks.torch.core as htcore
except Exception as e:
pytestmark = pytest.mark.skip("Couldn't import HPU plugin, skipping HPU tests")
def _convert_to_tensor_list(tensor_or_tensors):
if isinstance(tensor_or_tensors, tuple):
return list(tensor_or_tensors)
elif isinstance(tensor_or_tensors, list):
return tensor_or_tensors
elif isinstance(tensor_or_tensors, torch.Tensor):
# You can't return list(tensor_or_tensors), because it will fail on 0-d tensors
result_list = []
result_list.append(tensor_or_tensors)
return result_list
else:
raise TypeError("Can not convert outputs")
def compare_tensors(hpu_tensors, cpu_tensors, atol, rtol, assert_enable=True):
hpu_tensors = _convert_to_tensor_list(hpu_tensors)
cpu_tensors = _convert_to_tensor_list(cpu_tensors)
assert len(hpu_tensors) == len(cpu_tensors)
hpu_tensors = [tensor.to('cpu') if tensor is not None else tensor for tensor in hpu_tensors]
for i in range(len(hpu_tensors)):
if cpu_tensors[i] is None and hpu_tensors[i] is None:
continue
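# numpy has no bfloat16/float8 dtypes, so upcast those tensors to float32
# before converting them to arrays for comparison.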
hpu_tensors[i] = (
hpu_tensors[i].float()
if hpu_tensors[i].dtype in [torch.bfloat16, torch.float8_e5m2, torch.float8_e4m3fn]
else hpu_tensors[i]
)
cpu_tensors[i] = (
cpu_tensors[i].float()
if cpu_tensors[i].dtype in [torch.bfloat16, torch.float8_e5m2, torch.float8_e4m3fn]
else cpu_tensors[i]
)
if assert_enable:
np.testing.assert_allclose(
hpu_tensors[i].detach().numpy(),
cpu_tensors[i].detach().numpy(),
atol=atol,
rtol=rtol,
)
else:
print("hpu_result[{}]".format(i), hpu_tensors[i].detach().numpy())
print("cpu_result[{}]".format(i), cpu_tensors[i].detach().numpy())
return np.allclose(
hpu_tensors[i].detach().numpy(),
cpu_tensors[i].detach().numpy(),
atol=atol,
rtol=rtol,
equal_nan=True,
)
# taken from AutoGPTQ/tests/test_repacking.py
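# Draws a random bfloat16 weight, fake-quantizes it to unsigned 4-bit with
# per-group scales, and returns (original weight, an nn.Linear holding the
# dequantized reference weight, the scales).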
def gen_quant4(k, n, groupsize=-1, bias=False):
maxq = 2 ** 4 - 1
w = torch.randn((k, n), dtype=torch.bfloat16, device="cpu")
original_w = w.clone()
if groupsize != -1:
w = w.reshape((-1, groupsize, n))
w = w.permute(1, 0, 2)
w = w.reshape((groupsize, -1))
s = torch.max(torch.abs(w), 0, keepdim=True)[0]
s *= 2 / maxq
# Quantize.
w = torch.round(w / s).int()
# Unsigned storage.
w += (maxq + 1) // 2
w = torch.clamp(w, 0, maxq)
# Dequantize.
ref = (w - (maxq + 1) // 2).bfloat16() * s
if groupsize != -1:
def reshape(w):
w = w.reshape((groupsize, -1, n))
w = w.permute(1, 0, 2)
w = w.reshape((k, n)).contiguous()
return w
ref = reshape(ref)
w = reshape(w)
s = s.reshape((-1, n)).contiguous()
linear = torch.nn.Linear(k, n, bias=bias)
linear.weight.data = ref.t()
return original_w, linear, s
@pytest.mark.parametrize("bits", [4])
@pytest.mark.parametrize("group_size", [16, 32, 128])
@pytest.mark.parametrize("infeatures", [64, 128, 512, 4096, 11008])
@pytest.mark.parametrize("outfeatures", [64, 128, 512, 4096, 11008])
@pytest.mark.parametrize("bias", [True, False], ids=["bias", "no_bias"])
@pytest.mark.parametrize("scales_value, weight_value, zeros_value", [("normal", "normal", "normal"), ("normal", "normal", "range"), ("normal", "normal", "zeros"), ("ones", "zeros", "zeros"), ("ones", "zeros", "eights"), ("ones", "range", "zeros"), ("ones", "range", "ones"), ("ones", "7", "ones"), ("ones", "zeros", "range"),("ones", "zeros", "ones"), ("ones", "range", "range"), ("range", "range", "range"), ("range", "range", "zeros")])
@pytest.mark.parametrize("weight_dtype", [torch.bfloat16, torch.float32], ids=["bf16", "fp32"])
def test_qlinear_hpu(bits, group_size, infeatures, outfeatures, bias, scales_value, weight_value, zeros_value, weight_dtype):
qweight_shape_0 = infeatures // 32 * bits
qzeros_shape_0 = math.ceil(infeatures / group_size)
qzeros_shape_1 = outfeatures // 32 * bits
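# 4-bit values are packed 8 per int32, hence the `// 32 * bits` factors.
# As a worked example (a typical shape from the parametrization above):
# infeatures = outfeatures = 4096 with group_size = 128 gives
# qweight (512, 4096) and qzeros (32, 512).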
if qweight_shape_0 == 0 or qzeros_shape_0 == 0 or qzeros_shape_1 == 0:
pytest.skip(f"{qweight_shape_0=} == 0 or {qzeros_shape_0=} == 0 or {qzeros_shape_1=} == 0")
if infeatures < group_size:
pytest.skip(f"{infeatures=} < {group_size=}")
if infeatures != outfeatures:
pytest.skip(f"{infeatures=} != {outfeatures=}")
trainable = False
use_cuda_fp16 = False
kernel_switch_threshold = 128
from auto_gptq.nn_modules.qlinear import qlinear_hpu, qlinear_cuda_old
quant_hpu = qlinear_hpu.QuantLinear(bits=bits, group_size=group_size, infeatures=infeatures, outfeatures=outfeatures, bias=bias, use_cuda_fp16=use_cuda_fp16, kernel_switch_threshold=kernel_switch_threshold, trainable=trainable, weight_dtype=weight_dtype).to("hpu")
# The cuda-old implementation is the reference; it also runs on HPU.
quant_ref_cuda_old = qlinear_cuda_old.QuantLinear(bits=bits, group_size=group_size, infeatures=infeatures, outfeatures=outfeatures, bias=bias, use_cuda_fp16=use_cuda_fp16, kernel_switch_threshold=kernel_switch_threshold, trainable=trainable, weight_dtype=weight_dtype).to("hpu")
input = torch.rand((infeatures, outfeatures), dtype=weight_dtype).to("hpu")
_, linear, s = gen_quant4(infeatures, outfeatures, group_size, bias)
if scales_value == "ones":
s = torch.ones_like(s)
if scales_value == "range":
range_t = torch.tensor(list(range(1, s.numel()+1)), dtype=torch.int32)
shape_s = s.shape
s = (torch.ones(s.numel()) * range_t).reshape(shape_s).contiguous()
if weight_value == "ones":
linear.weight = torch.nn.Parameter(torch.ones_like(linear.weight))
elif weight_value == "zeros":
linear.weight = torch.nn.Parameter(torch.zeros_like(linear.weight))
elif weight_value == "range":
shape_w = linear.weight.shape
weight_local = torch.ones(shape_w, dtype=torch.int32)
range_t_weight = torch.tensor(list(range(0, 8)), dtype=torch.int32)
linear.weight = torch.nn.Parameter((torch.ones(weight_local.numel(), dtype=linear.weight.dtype).reshape(-1, 8) * range_t_weight).reshape(shape_w).contiguous())
elif weight_value.isnumeric():
linear.weight = torch.nn.Parameter(torch.full_like(linear.weight, int(weight_value)))
linear.weight = torch.nn.Parameter(linear.weight.to(weight_dtype))
if zeros_value == "zeros":
zeros = torch.full((infeatures // group_size, outfeatures), 0, dtype=torch.int32)
elif zeros_value == "range":
zeros = torch.ones((infeatures // group_size, outfeatures), dtype=torch.int32)
range_t_zeros = torch.tensor(list(range(1, 9)), dtype=torch.int32)
shape_z = zeros.shape
zeros = (torch.ones(zeros.numel(), dtype=torch.int32).reshape(-1, 8) * range_t_zeros).reshape(shape_z).contiguous()
elif zeros_value == "eights":
zeros = torch.full((infeatures // group_size, outfeatures), 8, dtype=torch.int32)
else:
zeros = torch.full((infeatures // group_size, outfeatures), 1, dtype=torch.int32)
htcore.mark_step()
quant_ref_cuda_old.pack(linear, s.clone().detach().T, zeros.clone().detach().T, g_idx=None)
htcore.mark_step()
quant_ref_cuda_old.to("hpu")
#TODO: pack independently
quant_hpu.set_packed(quant_ref_cuda_old)
htcore.mark_step()
quant_hpu.to("hpu")
out_ref_cuda_old = quant_ref_cuda_old(input)
htcore.mark_step()
quant_hpu.post_init()
htcore.mark_step()
out_hpu = quant_hpu(input)
htcore.mark_step()
out_ref_cuda_old = out_ref_cuda_old.cpu()
out_hpu = out_hpu.cpu()
compare_tensors(out_hpu, out_ref_cuda_old, rtol=1e-05, atol=1e-08)
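
# =========================================================================
# tests/test_peft_conversion.py
# Verifies that LoRA / AdaLoRA adapters can be attached to a quantized model
# via get_gptq_peft_model and that the resulting model actually trains (the
# loss decreases and stays finite).
# =========================================================================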
import math
from unittest import TestCase
import torch.cuda.amp
from peft import TaskType
from peft.peft_model import PeftModelForCausalLM
from torch.optim import Adam
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.utils.peft_utils import (
GPTQAdaLoraConfig,
GPTQLoraConfig,
GPTQLoraLinear,
GPTQSVDLinear,
get_gptq_peft_model,
)
MODEL_NAME = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
class TestPeftConversion(TestCase):
def check_model_trainable(self, model_lora: PeftModelForCausalLM, tokenizer: AutoTokenizer) -> None:
batch = tokenizer("Hello, world", return_tensors="pt")
batch = {key: value.to(model_lora.device) for key, value in batch.items()}
batch["labels"] = batch["input_ids"]
batch["attention_mask"] = batch["attention_mask"].float()
batch["attention_mask"].requires_grad = True
model_lora.gradient_checkpointing_enable()
optimizer = Adam(model_lora.parameters(), lr=1e-4)
model_lora.train()
losses = []
for _ in range(30):
optimizer.zero_grad()
with torch.cuda.amp.autocast():
loss = model_lora(**batch).loss
losses.append(loss.item())
loss.backward()
optimizer.step()
self.assertTrue(losses[0] > losses[-1])
self.assertTrue(all(math.isfinite(loss) for loss in losses))
self.assertTrue(not any(math.isnan(loss) for loss in losses))
def test_lora_conversion(self):
model = AutoGPTQForCausalLM.from_quantized(
MODEL_NAME,
use_triton=False,
warmup_triton=False,
trainable=True,
inject_fused_attention=True,
inject_fused_mlp=False,
use_safetensors=True,
)
peft_config = GPTQLoraConfig(
r=16,
lora_alpha=32,
lora_dropout=0.1,
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
target_modules=["qkv_proj"],
)
model_lora = get_gptq_peft_model(
model,
peft_config,
adapter_name="test",
auto_find_all_linears=False,
train_mode=True,
)
linear_layer = model_lora.base_model.model.model.layers[0].self_attn.qkv_proj
self.assertTrue(isinstance(linear_layer, GPTQLoraLinear))
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
self.check_model_trainable(model_lora, tokenizer)
def test_adalora_conversion(self):
model = AutoGPTQForCausalLM.from_quantized(
MODEL_NAME,
use_triton=False,
warmup_triton=False,
trainable=True,
inject_fused_attention=True,
inject_fused_mlp=False,
use_safetensors=True,
)
peft_config = GPTQAdaLoraConfig(
init_r=20,
target_r=16,
beta1=0.85,
beta2=0.85,
tinit=200,
tfinal=1000,
deltaT=10,
lora_alpha=32,
lora_dropout=0.1,
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
target_modules=["qkv_proj"],
)
model_lora = get_gptq_peft_model(
model,
peft_config,
adapter_name="test",
auto_find_all_linears=False,
train_mode=True,
)
linear_layer = model_lora.base_model.model.model.layers[0].self_attn.qkv_proj
self.assertTrue(isinstance(linear_layer, GPTQSVDLinear))
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
self.check_model_trainable(model_lora, tokenizer)
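
# =========================================================================
# tests/test_q4.py
# Kernel-level and end-to-end generation tests for the 4-bit backends:
# exllama, exllamav2, cuda-old, Marlin, Triton v2, Mixtral and HPU.
# =========================================================================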
import unittest
import torch
from parameterized import parameterized
from auto_gptq.nn_modules.qlinear.qlinear_exllama import QuantLinear
from auto_gptq.nn_modules.qlinear.qlinear_marlin import QuantLinear as MarlinQuantLinear
from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import QuantLinear as TritonV2QuantLinear
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
try:
from exllama_kernels import prepare_buffers, set_tuning_params
except ImportError as e:
print(f"[WARNING] Could not load exllama_kernels: {e}")
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, exllama_set_max_input_length
from auto_gptq.modeling._const import EXLLAMA_DEFAULT_MAX_INPUT_LENGTH
from auto_gptq.modeling._utils import autogptq_post_init
def get_diff(a, ref):
eps = 1e-6
return f"Maxdiff: {(a - ref).abs().max()}, Mean relative diff: {((a - ref).abs() / (ref.abs() + eps)).mean()}"
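# Expected output (1024 fp16 values) of the seeded 4-bit linear layer used in
# TestsQ4Exllama and TestsQ4ExllamaV2 below; judging by the name, it was
# generated with the cuda-old kernel.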
CUDA_OLD_REFERENCE = torch.Tensor(
[
5.8398,
6.8555,
7.2734,
6.4219,
6.2070,
5.8203,
6.5664,
6.4219,
6.2148,
5.3281,
5.7578,
7.5312,
8.1016,
6.1133,
7.2031,
6.6484,
6.5156,
6.0117,
6.0312,
6.1914,
6.2109,
6.8125,
5.8125,
7.1172,
7.3125,
6.7305,
5.9961,
6.5117,
6.1914,
5.9648,
7.1680,
6.4766,
7.2070,
6.5469,
6.7734,
6.4219,
6.8086,
7.0469,
5.9297,
6.4727,
6.2539,
5.9570,
7.2383,
5.8945,
6.0820,
5.7969,
7.1094,
6.2188,
6.7500,
7.3555,
6.2930,
6.7734,
5.9219,
7.4805,
6.8750,
6.4102,
6.5898,
6.5469,
7.6016,
6.7461,
5.9492,
7.2227,
5.8164,
5.4570,
6.2930,
7.3984,
6.0938,
7.3984,
5.9609,
6.3516,
6.5664,
5.7969,
7.1250,
6.0781,
6.7930,
5.9492,
6.1641,
6.5898,
6.0586,
6.3359,
6.7930,
7.0469,
6.0664,
6.3320,
5.4414,
6.7617,
5.1641,
7.2891,
6.8516,
6.5312,
5.6914,
7.3711,
6.8203,
5.9492,
7.0781,
6.3164,
7.1992,
7.1133,
7.4219,
7.5586,
7.1836,
6.9102,
6.4844,
6.9805,
6.1953,
6.5156,
5.4844,
6.6602,
6.6719,
7.9844,
6.4727,
6.6367,
6.2227,
6.4531,
5.0625,
6.4609,
6.7031,
6.6445,
6.5234,
6.8633,
6.6055,
5.6055,
6.4453,
7.2617,
6.3945,
6.6367,
6.1055,
7.0664,
6.0820,
6.6875,
6.1445,
6.8672,
6.2070,
6.8828,
6.1484,
6.7070,
6.8516,
6.2734,
7.1055,
7.0586,
6.9648,
5.9727,
6.1016,
6.8750,
7.0078,
7.1523,
5.7383,
5.9531,
6.5508,
7.5352,
6.1602,
6.2578,
6.3906,
5.7383,
6.7031,
5.7344,
6.3516,
5.2852,
7.5312,
6.4531,
6.6406,
6.2266,
6.1094,
5.9102,
5.7617,
6.3789,
7.0508,
6.3750,
6.3320,
6.8555,
6.7266,
7.0352,
7.7695,
6.3984,
6.5039,
6.8320,
6.1602,
6.0312,
6.3828,
6.9023,
7.4336,
7.3711,
6.1016,
7.0703,
6.3281,
6.8281,
6.4922,
5.9453,
5.1016,
6.7188,
6.1406,
6.6289,
7.2695,
6.2070,
6.7070,
7.2930,
7.1836,
6.3828,
6.1992,
6.7070,
7.8008,
7.7773,
5.6602,
7.0273,
6.6172,
6.0898,
5.3516,
7.3359,
5.9727,
6.0078,
7.0586,
6.3086,
6.8555,
7.2617,
7.3477,
6.3828,
7.1133,
6.6328,
7.3516,
6.9141,
7.2031,
6.9805,
6.1719,
6.7812,
8.3047,
6.5898,
6.3633,
6.2539,
7.2773,
6.5938,
6.4141,
6.8203,
6.8906,
7.8828,
5.9609,
6.4180,
7.3984,
5.7539,
7.1758,
6.6641,
6.9062,
6.2578,
7.5508,
6.1719,
6.5742,
5.9375,
6.7891,
6.2109,
6.5039,
6.8750,
6.2031,
6.8828,
7.1094,
5.9570,
7.2969,
6.6797,
6.8828,
5.5430,
6.9648,
5.8398,
6.5430,
6.3945,
6.5664,
5.8086,
6.6172,
7.0586,
6.8867,
6.0820,
5.8125,
6.7070,
7.5742,
6.2578,
6.1328,
6.5391,
5.4531,
6.8242,
6.6953,
6.8008,
6.3398,
6.4805,
7.2266,
6.3281,
6.6875,
6.4688,
5.9414,
7.4297,
5.8711,
6.0625,
5.8750,
6.5664,
5.8867,
6.3477,
6.1133,
6.9453,
5.0547,
6.7812,
6.4922,
7.2422,
5.4688,
6.2109,
7.2148,
6.1758,
5.9297,
7.1953,
5.5195,
6.3203,
5.9961,
7.9297,
6.2695,
6.4414,
6.7266,
7.1875,
7.3203,
5.4062,
6.0625,
7.0898,
5.3828,
5.6133,
6.0742,
6.6836,
5.7109,
7.2852,
7.7539,
7.5820,
6.4258,
5.9336,
6.3750,
6.3555,
7.5469,
6.2539,
6.5898,
6.4102,
7.0469,
5.7344,
7.2031,
6.7969,
5.6836,
7.6523,
6.9297,
7.8672,
6.4766,
6.3008,
7.0977,
6.5430,
7.0938,
5.8398,
6.9883,
6.5312,
6.3203,
6.3594,
5.4062,
6.9688,
5.7930,
6.3164,
6.5547,
7.1992,
5.8750,
6.3008,
6.7930,
6.0391,
7.4766,
6.6094,
6.5625,
5.9805,
6.2422,
7.2109,
6.6875,
5.3047,
7.6211,
5.9453,
6.5625,
6.1641,
6.1250,
6.5977,
7.7422,
7.0742,
5.6875,
6.2656,
6.6250,
6.8945,
5.7070,
6.3203,
5.7500,
6.2695,
6.2773,
6.8516,
6.4883,
7.0000,
6.7578,
6.1875,
5.9844,
5.5703,
6.7188,
5.5273,
5.3438,
7.2500,
6.7852,
6.5195,
6.8125,
6.0664,
6.7852,
7.0000,
7.0781,
6.8477,
7.2930,
6.3438,
7.1523,
6.3281,
6.8047,
7.3203,
5.3359,
6.1484,
6.5586,
7.3828,
6.2344,
7.1523,
6.4102,
5.5898,
7.0195,
7.1172,
5.8008,
6.5742,
6.2891,
8.0312,
6.9023,
6.5898,
7.1953,
6.7266,
6.0078,
5.5430,
6.4766,
6.4258,
5.9648,
8.0859,
5.0547,
7.2188,
7.4375,
6.5156,
5.9922,
6.3281,
6.2852,
6.7734,
6.2461,
6.9805,
5.4648,
5.8867,
6.8242,
6.3008,
6.3281,
7.3047,
7.1836,
6.5195,
6.6328,
6.7188,
5.4336,
6.5078,
5.3477,
5.5508,
7.3125,
5.8750,
6.5195,
6.2383,
6.3594,
6.0898,
6.4141,
5.9844,
6.6250,
7.7109,
6.0391,
7.2344,
5.9453,
5.9453,
7.0586,
5.6641,
7.2773,
6.5195,
7.2227,
6.3359,
5.3203,
6.4375,
7.2383,
6.4023,
6.2148,
7.3750,
5.8164,
6.2109,
6.5430,
5.8164,
6.1680,
6.7656,
6.0820,
6.1094,
6.5312,
6.8906,
6.8320,
6.1289,
6.3125,
7.6797,
6.3008,
6.0000,
7.3320,
6.7852,
6.9297,
6.6328,
6.2266,
5.1602,
6.2031,
7.0547,
5.9492,
6.0703,
6.0977,
6.8086,
6.0742,
6.0195,
7.0625,
6.5781,
5.7461,
6.1562,
7.0430,
6.7148,
6.5312,
6.5820,
6.4570,
7.5508,
5.6289,
6.0547,
6.5000,
7.3125,
5.8477,
5.9297,
6.2578,
6.0078,
5.9922,
7.3398,
7.4922,
7.8906,
7.5547,
5.4648,
6.5156,
6.3242,
6.1094,
6.9219,
6.7227,
6.6836,
7.4023,
5.9648,
7.2383,
6.7695,
6.6797,
7.0547,
6.3047,
6.4688,
6.9961,
6.0391,
5.9727,
6.8398,
6.7422,
5.7656,
5.4766,
6.7852,
7.0820,
5.3516,
7.6523,
5.1562,
6.6445,
6.1211,
6.2695,
6.0703,
6.3594,
6.4062,
6.3398,
5.7578,
6.5391,
6.2500,
6.5742,
6.5000,
7.5625,
7.0117,
6.5547,
7.1250,
6.4453,
6.6094,
6.1875,
6.4219,
6.6172,
6.4336,
6.5703,
6.1758,
6.4219,
6.6016,
6.7383,
6.7070,
6.1328,
5.5586,
6.6367,
6.3789,
6.2578,
5.5039,
6.6172,
6.4648,
5.8086,
7.2031,
5.8125,
6.3711,
7.6758,
7.1289,
5.8086,
6.3008,
6.2109,
6.1602,
6.1797,
7.2305,
6.7266,
6.2422,
5.6719,
6.7070,
6.9414,
6.8594,
7.4023,
7.2109,
6.0156,
6.6680,
6.6172,
7.1250,
6.6523,
6.9531,
6.7617,
6.4961,
6.9414,
5.7188,
7.6367,
6.5469,
6.2305,
6.4414,
7.4648,
5.9102,
6.2461,
6.1367,
6.8203,
6.5703,
6.8867,
7.0000,
6.7539,
6.1719,
6.5469,
6.2422,
5.4297,
5.7305,
5.1641,
6.1875,
7.0312,
6.6484,
6.0234,
7.4102,
6.8711,
6.3086,
6.3711,
6.7344,
6.6992,
5.9766,
7.3906,
7.1875,
6.4883,
6.3984,
7.3438,
6.9688,
6.9062,
6.4375,
6.7891,
7.0117,
6.4883,
5.7500,
7.0898,
7.0742,
6.7070,
5.8750,
6.0469,
6.6445,
5.2773,
6.8984,
6.1641,
7.0508,
7.4609,
5.0273,
6.7734,
6.4531,
5.7656,
6.5312,
7.4648,
6.1250,
6.5625,
7.1367,
6.0625,
6.1211,
6.9766,
6.6758,
6.3164,
6.8828,
6.8203,
6.7500,
6.5352,
7.3008,
6.7852,
6.1914,
5.0508,
6.7188,
7.1172,
6.8008,
6.8086,
5.4883,
6.9180,
6.5742,
6.1719,
7.0469,
7.1523,
5.9492,
5.8594,
6.8320,
6.1719,
6.2031,
6.8398,
7.3008,
6.6289,
6.4922,
6.0000,
5.4766,
6.3320,
6.5117,
6.2812,
7.5742,
6.3516,
7.0039,
6.4570,
7.1523,
7.6289,
6.2578,
7.1875,
6.4844,
5.7930,
6.7070,
7.5508,
7.1797,
6.0430,
6.8711,
6.5742,
7.5781,
6.4766,
6.5391,
6.9453,
6.1992,
6.6367,
6.2812,
6.0234,
6.6953,
7.0312,
6.2031,
6.5625,
6.6719,
6.1719,
6.5586,
5.7031,
7.4609,
6.6211,
7.7227,
6.9141,
6.0469,
6.2500,
5.3828,
6.0078,
5.8164,
5.8867,
6.1523,
6.6523,
6.6953,
7.3125,
6.4844,
5.9570,
5.9531,
6.2109,
5.5039,
6.5117,
6.8203,
6.6133,
6.4766,
5.9297,
7.1445,
7.1914,
6.0117,
6.8281,
6.7422,
6.1328,
6.9805,
6.5625,
6.9180,
7.1133,
7.3359,
5.7617,
5.8711,
6.4961,
6.5859,
6.2422,
6.5273,
6.7461,
6.6992,
6.7695,
6.6289,
5.9453,
5.9805,
7.1172,
6.6719,
6.0039,
7.6875,
6.7812,
7.8359,
6.9531,
7.4336,
7.6602,
6.8164,
7.3945,
7.1602,
6.8789,
5.0078,
6.0547,
6.8086,
6.7070,
6.4688,
6.4492,
6.6172,
5.5625,
6.6914,
6.4297,
5.7461,
5.3359,
6.8750,
6.4609,
7.4062,
5.2070,
6.0820,
6.7383,
6.5703,
6.1797,
6.7070,
6.5977,
5.9961,
6.6328,
6.9375,
6.3906,
6.6484,
4.9609,
6.6445,
6.5898,
7.1875,
7.5195,
6.7969,
6.1367,
6.8906,
7.4297,
6.3633,
6.0508,
6.5000,
6.4648,
6.7539,
6.7109,
5.8086,
6.6016,
7.1133,
4.8672,
6.6367,
6.1641,
5.1758,
6.9453,
6.3242,
7.0664,
6.4805,
6.3516,
6.7383,
8.4688,
6.7305,
5.9844,
6.5938,
7.2969,
6.5977,
7.5898,
6.2969,
6.8672,
6.6680,
7.1289,
6.6875,
5.4258,
8.1875,
8.0391,
7.7969,
6.6445,
7.0703,
7.3359,
6.9805,
6.6328,
6.5352,
6.2422,
5.5820,
6.8633,
6.8047,
6.5703,
6.0117,
6.7539,
7.1719,
6.8438,
7.3633,
6.6016,
7.2070,
6.4727,
5.8008,
7.4062,
7.4805,
6.6445,
5.9023,
6.3984,
6.9961,
6.6680,
6.8242,
6.7148,
6.6172,
6.9727,
6.8320,
5.9766,
6.6133,
5.5977,
6.7773,
7.3906,
6.9219,
7.0781,
6.6914,
5.7539,
6.7969,
6.8008,
5.8047,
7.1055,
6.4961,
6.0352,
5.6211,
7.4414,
7.0703,
6.1172,
6.7461,
6.4492,
7.7148,
6.4258,
6.0039,
6.5156,
7.2188,
7.4531,
7.4844,
7.5938,
7.4023,
6.7617,
6.0078,
6.3320,
5.8906,
7.5977,
5.6523,
6.7734,
6.3008,
5.2227,
7.1719,
7.1289,
6.6602,
5.4609,
7.0312,
6.0820,
6.1719,
6.0000,
6.5547,
6.6328,
7.0547,
7.0859,
6.2656,
5.5234,
6.0273,
6.7891,
7.1875,
6.9531,
6.8203,
6.3516,
6.1172,
6.4648,
6.9180,
7.3906,
6.2812,
5.7109,
6.1484,
6.9102,
6.8711,
7.0156,
6.1445,
5.8867,
6.3828,
5.9961,
6.6914,
6.7891,
7.0820,
6.6719,
6.9297,
6.3750,
6.7578,
6.4883,
6.2227,
6.2305,
6.0508,
6.6484,
5.7578,
7.2070,
7.2383,
6.9375,
7.2578,
6.5312,
6.0312,
6.7930,
6.2578,
7.0625,
7.2148,
6.4961,
7.0703,
6.4727,
7.3906,
]
).to(torch.float16)
class TestsQ4Exllama(unittest.TestCase):
def test_exllama(self):
group_size = 128
m = 1
k = 1024
n = 1024
device = torch.device("cuda:0")
linear_class = dynamically_import_QuantLinear(
use_triton=False,
desc_act=False,
group_size=group_size,
bits=4,
disable_exllama=False,
disable_exllamav2=True,
)
linear = linear_class(
bits=4,
group_size=group_size,
infeatures=k,
outfeatures=n,
bias=False,
)
self.assertTrue(isinstance(linear, QuantLinear))
torch.manual_seed(42)
linear.qweight = torch.randint(-100, 100, size=linear.qweight.shape, dtype=torch.int32)
linear.scales = linear.scales + 0.002
linear = linear.eval()
linear = linear.to(device)
linear = autogptq_post_init(linear, use_act_order=False)
max_inner_outer_dim = max(k, n)
max_dq_buffer_size = linear.infeatures * linear.outfeatures
max_input_len = 2048
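# prepare_buffers registers the scratch space the exllama (v1) kernel needs:
# temp_state, sized by the maximum input length, and temp_dq, sized by the
# largest weight matrix to dequantize.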
buffers = {
"temp_state": torch.zeros((max_input_len, max_inner_outer_dim), dtype=torch.float16, device=device),
"temp_dq": torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=device),
}
prepare_buffers(device, buffers["temp_state"], buffers["temp_dq"])
# Using the default from exllama repo here.
matmul_recons_thd = 8
matmul_fused_remap = False
matmul_no_half2 = False
set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2)
inp = torch.rand(1, m, k, dtype=torch.float16).to(device)
with torch.no_grad():
res = linear(inp)[0][0]
reference = CUDA_OLD_REFERENCE.to(device)
self.assertTrue(
torch.allclose(res, reference, rtol=3e-5, atol=2e-2),
get_diff(res, reference),
)
def test_exllama_buffer_size(self):
prompt = "I am in Paris and" * 450
device = torch.device("cuda:0")
model_id = "TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g"
revision = "actorder"
model_basename = "vicuna-13B-1.1-GPTQ-4bit-128g.latest"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
model_basename=model_basename,
disable_exllama=False,
disable_exllamav2=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
self.assertTrue(
inp["input_ids"].shape[1] > EXLLAMA_DEFAULT_MAX_INPUT_LENGTH
) # 2048 is the default max_input_length
with self.assertRaises(RuntimeError) as cm:
_ = model_q.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
self.assertTrue("temp_state buffer is too small" in str(cm.exception))
model_q = exllama_set_max_input_length(model_q, 4096)
_ = model_q.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
model_q = exllama_set_max_input_length(model_q, 1034)
with self.assertRaises(RuntimeError) as cm:
_ = model_q.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
self.assertTrue("temp_state buffer is too small" in str(cm.exception))
def test_generation_no_act_order(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")
# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
device="cuda:0",
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
disable_exllama=False,
disable_exllamav2=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
def test_generation_with_act_order(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")
# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and it is a beautiful day. I am sitting in a café, drinking coffee and writing this book. I am surrounded by the sights and sounds of the city, and I am filled with a sense of contentment and gratitude.\n\nI am grateful for the opportunity to live and"
model_id = "TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g"
revision = "actorder"
model_basename = "vicuna-13B-1.1-GPTQ-4bit-128g.latest"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
model_basename=model_basename,
disable_exllama=False,
disable_exllamav2=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
def test_multigpu(self):
# TODO
pass
class TestsQ4CUDA(unittest.TestCase):
REFERENCE_OLD_HALF = torch.Tensor(
[
1.5332,
2.1250,
1.7910,
1.8008,
1.9688,
1.3262,
1.7627,
1.8164,
1.9307,
1.8574,
1.5449,
1.5293,
1.6074,
1.5566,
1.8545,
1.6582,
1.8838,
2.0215,
1.8525,
1.2920,
1.9561,
2.2617,
1.7891,
2.2656,
1.6543,
2.0566,
1.4756,
1.1826,
1.8174,
2.1191,
1.6641,
2.0586,
1.6182,
1.7627,
1.7920,
1.4424,
2.0723,
1.6865,
1.2979,
2.0840,
1.6729,
1.9648,
2.1602,
1.6006,
1.2773,
2.2129,
1.8057,
1.7285,
1.6621,
1.6475,
1.4805,
1.7959,
1.5010,
0.8643,
2.6680,
2.0918,
1.8555,
1.9795,
1.3271,
1.8359,
1.6338,
1.9766,
1.7881,
1.6025,
1.7637,
1.7012,
1.7852,
1.5674,
0.8091,
1.7188,
1.6123,
1.8525,
1.4434,
1.9590,
1.5801,
1.4209,
1.7178,
1.8408,
2.4141,
1.9658,
1.4922,
2.1992,
1.9473,
1.8047,
1.2979,
1.6396,
1.6221,
1.5020,
1.9941,
1.7725,
1.6064,
1.5449,
1.8418,
1.2656,
1.4824,
1.7734,
2.0098,
1.7197,
1.7686,
1.4160,
1.7275,
2.1738,
1.9609,
1.7686,
1.6396,
2.1465,
1.2188,
1.2002,
2.1113,
1.7227,
1.5811,
1.7607,
2.2773,
1.8945,
1.4111,
1.5801,
1.7744,
2.0684,
2.1621,
1.8027,
1.1045,
1.9648,
2.2402,
2.0742,
1.3330,
1.5840,
2.1465,
2.0176,
1.5068,
1.9834,
1.7725,
1.5527,
1.7803,
1.7744,
1.5312,
1.2695,
1.9209,
2.0469,
1.6777,
2.5215,
1.8389,
1.7598,
1.5498,
1.6807,
1.7324,
1.5938,
1.9268,
1.7734,
1.4463,
2.0391,
2.0527,
2.2129,
1.6787,
2.0586,
1.8975,
1.5713,
1.6992,
1.8770,
1.7207,
1.7080,
1.1611,
1.8584,
2.4570,
1.6016,
1.4834,
1.1777,
1.7969,
1.8955,
1.8906,
1.6738,
1.7510,
1.4316,
1.8340,
2.2461,
1.7744,
2.1934,
1.4824,
1.8828,
1.6387,
2.4629,
1.8887,
1.5137,
1.4648,
1.6406,
1.7188,
2.2656,
1.5801,
2.1484,
2.0625,
2.0098,
1.7549,
1.1768,
1.4385,
2.0723,
1.6172,
1.7832,
1.8301,
1.6064,
1.5215,
1.9297,
2.3750,
2.1504,
1.7070,
1.1289,
1.4473,
1.5674,
1.6836,
2.2930,
1.1221,
1.5557,
1.7559,
1.8281,
2.0703,
1.9443,
2.0684,
2.2988,
1.6348,
2.3379,
2.4414,
1.8857,
2.0039,
1.4844,
1.5488,
1.6514,
2.3711,
1.9941,
2.3066,
1.4287,
2.1777,
1.6445,
1.6025,
1.5938,
1.5508,
1.9502,
2.1309,
1.2666,
1.1523,
1.9561,
1.8584,
1.9746,
1.5986,
1.9688,
2.1973,
1.1523,
2.3281,
1.2451,
1.8447,
2.2051,
1.5254,
1.5342,
2.1016,
1.6523,
1.6279,
1.1680,
1.3037,
2.1035,
]
).to(torch.float16)
REFERENCE_OLD_NO_HALF = torch.Tensor(
[
1.5332,
2.1250,
1.7910,
1.7998,
1.9678,
1.3262,
1.7617,
1.8154,
1.9307,
1.8574,
1.5449,
1.5293,
1.6074,
1.5557,
1.8545,
1.6582,
1.8838,
2.0195,
1.8525,
1.2920,
1.9561,
2.2617,
1.7891,
2.2656,
1.6543,
2.0566,
1.4756,
1.1826,
1.8164,
2.1191,
1.6641,
2.0586,
1.6182,
1.7617,
1.7920,
1.4424,
2.0723,
1.6865,
1.2969,
2.0840,
1.6729,
1.9639,
2.1602,
1.5996,
1.2773,
2.2129,
1.8057,
1.7275,
1.6621,
1.6475,
1.4805,
1.7949,
1.5010,
0.8643,
2.6680,
2.0918,
1.8545,
1.9795,
1.3271,
1.8350,
1.6338,
1.9766,
1.7881,
1.6025,
1.7637,
1.7012,
1.7842,
1.5664,
0.8086,
1.7188,
1.6113,
1.8516,
1.4434,
1.9590,
1.5801,
1.4209,
1.7168,
1.8408,
2.4141,
1.9658,
1.4922,
2.1973,
1.9463,
1.8047,
1.2979,
1.6396,
1.6221,
1.5010,
1.9941,
1.7725,
1.6064,
1.5449,
1.8418,
1.2656,
1.4824,
1.7734,
2.0098,
1.7188,
1.7686,
1.4160,
1.7266,
2.1738,
1.9600,
1.7686,
1.6396,
2.1465,
1.2188,
1.2002,
2.1113,
1.7227,
1.5811,
1.7598,
2.2773,
1.8936,
1.4102,
1.5801,
1.7734,
2.0684,
2.1621,
1.8027,
1.1045,
1.9648,
2.2402,
2.0742,
1.3330,
1.5840,
2.1465,
2.0176,
1.5068,
1.9834,
1.7725,
1.5527,
1.7793,
1.7744,
1.5312,
1.2695,
1.9209,
2.0469,
1.6777,
2.5195,
1.8389,
1.7598,
1.5498,
1.6797,
1.7324,
1.5928,
1.9258,
1.7734,
1.4463,
2.0391,
2.0508,
2.2129,
1.6787,
2.0586,
1.8975,
1.5713,
1.6992,
1.8770,
1.7207,
1.7070,
1.1602,
1.8584,
2.4570,
1.6016,
1.4834,
1.1777,
1.7959,
1.8955,
1.8906,
1.6738,
1.7510,
1.4316,
1.8330,
2.2461,
1.7744,
2.1934,
1.4824,
1.8828,
1.6387,
2.4629,
1.8887,
1.5137,
1.4648,
1.6406,
1.7178,
2.2637,
1.5801,
2.1484,
2.0605,
2.0098,
1.7539,
1.1768,
1.4375,
2.0723,
1.6162,
1.7832,
1.8291,
1.6064,
1.5215,
1.9297,
2.3750,
2.1504,
1.7061,
1.1289,
1.4473,
1.5674,
1.6836,
2.2930,
1.1221,
1.5547,
1.7559,
1.8281,
2.0703,
1.9443,
2.0684,
2.2988,
1.6348,
2.3379,
2.4414,
1.8857,
2.0020,
1.4834,
1.5488,
1.6514,
2.3711,
1.9941,
2.3047,
1.4277,
2.1777,
1.6445,
1.6025,
1.5938,
1.5508,
1.9502,
2.1309,
1.2666,
1.1514,
1.9551,
1.8584,
1.9746,
1.5986,
1.9688,
2.1953,
1.1514,
2.3262,
1.2451,
1.8447,
2.2051,
1.5254,
1.5342,
2.1016,
1.6523,
1.6279,
1.1680,
1.3037,
2.1035,
]
).to(torch.float16)
@parameterized.expand([(False,), (True,)])
def test_cuda_old(self, use_half2: bool):
group_size = 128
# test the 256 kernel (in_features % 256 == 0 and out_features % 256 == 0)
m = 1
k = 256
n = 256
device = "cuda"
linear_class = dynamically_import_QuantLinear(
use_triton=False,
desc_act=False,
group_size=group_size,
bits=4,
disable_exllama=True,
disable_exllamav2=True,
)
weight_dtype = torch.float16 if use_half2 else torch.float32
linear = linear_class(
bits=4,
group_size=group_size,
infeatures=k,
outfeatures=n,
bias=False,
weight_dtype=weight_dtype,
)
torch.manual_seed(42)
linear.qweight = torch.randint(-100, 100, size=linear.qweight.shape, dtype=torch.int32)
linear.scales = linear.scales + 0.002
linear.use_cuda_fp16 = use_half2
self.assertTrue(linear.autogptq_cuda_available)
# Create the input in float16 first and then cast to weight_dtype, so that the seeded random values are identical in both cases.
inp = torch.rand(1, m, k, dtype=torch.float16).to(device).to(weight_dtype)
linear = linear.eval()
linear = linear.to(device)
with torch.no_grad():
res = linear(inp)[0][0]
if use_half2:
reference = self.REFERENCE_OLD_HALF.to(device).to(weight_dtype)
else:
reference = self.REFERENCE_OLD_NO_HALF.to(device).to(weight_dtype)
self.assertTrue(torch.allclose(res, reference, rtol=1e-3), get_diff(res, reference))
@parameterized.expand(
[
(torch.float32, "cpu"),
(torch.float32, "cuda:0"),
(torch.float16, "cuda:0"),
]
)
def test_generation_with_act_order(self, torch_dtype, device):
prompt = "I am in Paris and"
# Reference generated with the cuda-old kernel
if device == "cpu":
# CPU implementation is extremely slow.
new_tokens = 2
reference_output = "<s> I am in Paris and it is"
else:
reference_output = "<s> I am in Paris and it is a beautiful day. I am sitting in a café, drinking coffee and writing this book. I am surrounded by the sights and sounds of the city, and I am filled with a sense of contentment and gratitude.\n\nI am grateful for the opportunity to live and"
new_tokens = 60
model_id = "TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g"
revision = "actorder"
model_basename = "vicuna-13B-1.1-GPTQ-4bit-128g.latest"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
revision=revision,
device=device,
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
model_basename=model_basename,
disable_exllama=True,
disable_exllamav2=True,
torch_dtype=torch_dtype,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
# This one uses Autocast.
res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
# This one does not.
res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
@parameterized.expand(
[
(torch.float32, "cpu"),
(torch.float32, "cuda:0"),
(torch.float16, "cuda:0"),
]
)
def test_generation_no_act_order(self, torch_dtype, device):
prompt = "I am in Paris and"
# Reference generated with the cuda-old kernel
if device == "cpu":
# CPU implementation is extremely slow.
new_tokens = 3
reference_output = "<s> I am in Paris and I am going"
else:
reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
new_tokens = 60
model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
device=device,
use_triton=False,
disable_exllama=True,
disable_exllamav2=True,
torch_dtype=torch_dtype,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
# This one uses Autocast.
res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
# This one does not.
res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
class TestsQ4ExllamaV2(unittest.TestCase):
def test_exllamav2(self):
from auto_gptq.nn_modules.qlinear.qlinear_exllamav2 import QuantLinear
group_size = 128
m = 1
k = 1024
n = 1024
device = torch.device("cuda:0")
linear_class = dynamically_import_QuantLinear(use_triton=False, desc_act=False, group_size=group_size, bits=4)
linear = linear_class(
bits=4,
group_size=group_size,
infeatures=k,
outfeatures=n,
bias=False,
)
self.assertTrue(isinstance(linear, QuantLinear))
torch.manual_seed(42)
linear.qweight = torch.randint(-100, 100, size=linear.qweight.shape, dtype=torch.int32)
linear.scales = linear.scales + 0.002
linear = linear.eval()
linear = linear.to(device)
linear = autogptq_post_init(linear, use_act_order=False)
inp = torch.rand(1, m, k, dtype=torch.float16).to(device)
with torch.no_grad():
res = linear(inp)[0][0]
reference = CUDA_OLD_REFERENCE.to(device)
self.assertTrue(
torch.allclose(res, reference, rtol=3e-5, atol=2e-2),
get_diff(res, reference),
)
def test_generation_no_act_order(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")
# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_q = AutoGPTQForCausalLM.from_quantized(model_id, device="cuda:0", use_triton=False)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, do_sample=False, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
def test_generation_with_act_order(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")
# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and it is a beautiful day. I am sitting in a café, drinking coffee and writing this book. I am surrounded by the sights and sounds of the city, and I am filled with a sense of contentment and gratitude.\n\nI am grateful for the opportunity to live and"
model_id = "TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g"
revision = "actorder"
model_basename = "vicuna-13B-1.1-GPTQ-4bit-128g.latest"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
model_basename=model_basename,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
def test_exllama_v2_buffer_size(self):
# prompt = "I'm in Paris and" * 450
prompt = "I'm in Paris and" * 500
device = torch.device("cuda:0")
model_id = "TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g"
revision = "actorder"
model_basename = "vicuna-13B-1.1-GPTQ-4bit-128g.latest"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
model_basename=model_basename,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
self.assertTrue(inp["input_ids"].shape[1] > 2048) # 2048 is the default max_input_length for Llama
_ = model_q.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
class TestsMixtral(unittest.TestCase):
def test_mixtral_generation(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")
# Reference generated with the cuda-old kernel
reference_output = """<s> I am in Paris andpublishedющиеcs performancesension manual offset亡VIDEO Kel RepubliczwDrawlichen LondresPSungspfn CreahooEESlider laughselvesлександTrytpl recallслу Ор coldsubset########serdeacion providestrm thoughts président oktobermulticol../редβ themselvesterraряд conflictscommandMass diagonal選 ptrTY還 Havepliedument relate redu"""
model_id = "TheBlokeAI/Mixtral-tiny-GPTQ"
model_basename = "model"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
use_safetensors=True,
device="cuda:0",
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
model_basename=model_basename,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60, do_sample=False)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
class TestQ4Marlin(unittest.TestCase):
def test_generation(self):
# Reference generated with the cuda-old kernel and TheBloke/Llama-2-7B-Chat-GPTQ
reference_output = "<s> I am in Paris and I am feeling very sad and lonely. everybody I know is busy and I don't have any friends here. I am staying in a small apartment in the 11th arrondissement and I am feeling very isolated. I miss my friends and family back home and I don'"
prompt = "I am in Paris and"
device = torch.device("cuda:0")
model_id = "TheBloke/Llama-2-7B-Chat-GPTQ"
try:
model_q = AutoGPTQForCausalLM.from_quantized(model_id, device="cuda:0", use_marlin=True)
except ValueError as e:
if torch.version.hip:
self.assertTrue("Can not use Marlin int4*fp16 kernel with AMD ROCm" in str(e))
self.skipTest("Can not run this test on ROCm")
else:
raise e
has_marlin = False
for _, module in model_q.named_modules():
if isinstance(module, MarlinQuantLinear):
has_marlin = True
break
self.assertTrue(has_marlin)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
def test_bias(self):
# TheBloke/Llama-2-7B-Chat-GPTQ has bias tensors, but they are all zeros, so use a checkpoint that actually exercises bias.
model_id = "s3nh/starcoderbase-1b-GPTQ"
try:
model_q = AutoGPTQForCausalLM.from_quantized(model_id, device="cuda:0", use_marlin=True)
except ValueError as e:
if torch.version.hip:
self.assertTrue("Can not use Marlin int4*fp16 kernel with AMD ROCm" in str(e))
self.skipTest("Can not run this test on ROCm")
else:
raise e
for _, param in model_q.named_parameters():
self.assertTrue(param.device != torch.device("meta"))
for _, param in model_q.named_buffers():
self.assertTrue(param.device != torch.device("meta"))
self.assertTrue(torch.count_nonzero(model_q.model.transformer.h[0].attn.c_proj.bias) > 0)
self.assertTrue(torch.count_nonzero(model_q.model.transformer.h[0].attn.c_attn.bias) > 0)
tokenizer = AutoTokenizer.from_pretrained("Xenova/starcoderbase-1b")
prompt = "Today I am in Paris and"
inp = tokenizer(prompt, return_tensors="pt").to("cuda:0")
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertTrue(predicted_text.startswith("Today I am in Paris and I am a student of the Master's"))
class TestsQ4Triton(unittest.TestCase):
def test_generation_no_act_order(self):
prompt = "I am in Paris and"
reference_output = "<s> I am in Paris and I am going to the Louvre Museum. What time does it open and what is the best way to get there?\nThe Louvre Museum in Paris is open from 9:00 AM to 6:00 PM every day except for Tuesdays. The best way to get"
new_tokens = 60
model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
device="cuda:0",
use_triton=False,
disable_exllama=True,
disable_exllamav2=True,
torch_dtype=torch.float16,
use_tritonv2=True,
)
for _, submodule in model_q.named_modules():
if isinstance(submodule, TritonV2QuantLinear):
break
else:
raise ValueError("Did not find a tritonv2 linear layer")
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to("cuda:0")
# This one uses Autocast.
res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
# This one does not.
res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
def test_generation_with_act_order(self):
prompt = "I am in Paris and"
device = torch.device("cuda:0")
# Reference generated with the cuda-old kernel
reference_output = "<s> I am in Paris and it is a beautiful day. I am sitting in a café, drinking coffee and writing this book. I am surrounded by the sights and sounds of the city, and I am filled with a sense of contentment and gratitude.\n\nI am grateful for the opportunity to live and"
model_id = "TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g"
revision = "actorder"
model_basename = "vicuna-13B-1.1-GPTQ-4bit-128g.latest"
model_q = AutoGPTQForCausalLM.from_quantized(
model_id,
revision=revision,
device="cuda:0",
use_triton=False,
inject_fused_attention=False,
inject_fused_mlp=False,
model_basename=model_basename,
disable_exllama=True,
disable_exllamav2=True,
use_tritonv2=True,
)
for _, submodule in model_q.named_modules():
if isinstance(submodule, TritonV2QuantLinear):
break
else:
raise ValueError("Did not find a tritonv2 linear layer")
tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
class TestQ4HPU(unittest.TestCase):
@parameterized.expand(
[
("hpu", torch.bfloat16),
("hpu", torch.float),
]
)
def test_generation(self, in_device, model_dtype):
try:
import habana_frameworks.torch.core as htcore
except Exception as e:
self.skipTest("Couldn't import HPU plugin, skipping HPU tests")
# Reference generated with the cuda-old kernel and TheBloke/Llama-2-7B-Chat-GPTQ
reference_output = "<s> I am in Paris and I am feeling very sad and lonely. everybody I know is busy and I don't have any friends here. I am staying in a small apartment in the 11th arrondissement and I am feeling very isolated. I miss my friends and family back home and I don'"
prompt = "I am in Paris and"
device = torch.device(in_device)
model_id = "TheBloke/Llama-2-7B-Chat-GPTQ"
try:
from transformers import GPTQConfig, AutoModelForCausalLM
quantization_config = GPTQConfig(bits=4, use_exllama=False)
model_kwargs = {
"revision": "main",
"token": None
}
model_q = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=model_dtype, quantization_config=quantization_config, **model_kwargs)
model_q = model_q.eval().to(device)
except ValueError as e:
if torch.version.hip:
self.assertTrue("Can not use HPU int4 kernel" in str(e))
self.skipTest("Can not run this test on HPU")
else:
raise e
tokenizer_kwargs = {
"revision": "main",
"token": None
}
tokenizer = AutoTokenizer.from_pretrained(model_id, **tokenizer_kwargs)
if not model_q.config.is_encoder_decoder:
tokenizer.padding_side = "left"
# Some models like GPT2 do not have a PAD token so we have to set it if necessary
if model_q.config.model_type == "llama":
# unwind broken decapoda-research config
model_q.generation_config.pad_token_id = 0
model_q.generation_config.bos_token_id = 1
model_q.generation_config.eos_token_id = 2
tokenizer.bos_token_id = model_q.generation_config.bos_token_id
tokenizer.eos_token_id = model_q.generation_config.eos_token_id
tokenizer.pad_token_id = model_q.generation_config.pad_token_id
tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id)
tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id)
tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model_q.generation_config.pad_token_id = model_q.generation_config.eos_token_id
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertEqual(predicted_text, reference_output)
@parameterized.expand(
[
("hpu", torch.bfloat16),
("hpu", torch.float),
]
)
def test_bias(self, in_device, model_dtype):
try:
import habana_frameworks.torch.core as htcore
except Exception as e:
self.skipTest("Couldn't import HPU plugin, skipping HPU tests")
device = torch.device(in_device)
# TheBloke/Llama-2-7B-Chat-GPTQ has bias tensors, but they are all zeros, so use a checkpoint that actually exercises bias.
model_id = "s3nh/starcoderbase-1b-GPTQ"
try:
model_kwargs = {
"revision": "main",
"token": None
}
model_q = AutoGPTQForCausalLM.from_quantized(model_id, torch_dtype=model_dtype, use_marlin=False, **model_kwargs)
model_q = model_q.eval().to(device)
except ValueError as e:
if torch.version.hip:
self.assertTrue("Can not use HPU int4 kernel" in str(e))
self.skipTest("Can not run this test on HPU")
else:
raise e
for _, param in model_q.named_parameters():
self.assertTrue(param.device != torch.device("meta"))
for _, param in model_q.named_buffers():
self.assertTrue(param.device != torch.device("meta"))
self.assertTrue(torch.count_nonzero(model_q.model.transformer.h[0].attn.c_proj.bias) > 0)
self.assertTrue(torch.count_nonzero(model_q.model.transformer.h[0].attn.c_attn.bias) > 0)
tokenizer_kwargs = {
"revision": "main",
"token": None
}
tokenizer = AutoTokenizer.from_pretrained("Xenova/starcoderbase-1b", **tokenizer_kwargs)
if not model_q.config.is_encoder_decoder:
tokenizer.padding_side = "left"
# Some models like GPT2 do not have a PAD token so we have to set it if necessary
if model_q.config.model_type == "llama":
# unwind broken decapoda-research config
model_q.generation_config.pad_token_id = 0
model_q.generation_config.bos_token_id = 1
model_q.generation_config.eos_token_id = 2
tokenizer.bos_token_id = model_q.generation_config.bos_token_id
tokenizer.eos_token_id = model_q.generation_config.eos_token_id
tokenizer.pad_token_id = model_q.generation_config.pad_token_id
tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id)
tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id)
tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model_q.generation_config.pad_token_id = model_q.generation_config.eos_token_id
prompt = "Today I am in Paris and"
inp = tokenizer(prompt, return_tensors="pt").to(device)
res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60)
predicted_text = tokenizer.decode(res[0])
self.assertTrue(predicted_text.startswith("Today I am in Paris and I am a student of the Master's"))
import os
import tempfile
import unittest
import torch.cuda
from parameterized import parameterized
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.quantization import CHECKPOINT_FORMAT, QUANT_CONFIG_FILENAME, BaseQuantizeConfig
class TestQuantization(unittest.TestCase):
@parameterized.expand([(False,), (True,)])
def test_quantize(self, use_marlin: bool):
pretrained_model_dir = "saibo/llama-1B"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = [
tokenizer(
"auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
),
tokenizer(
"Today I am in Paris and it is a wonderful day."
),
]
quantize_config = BaseQuantizeConfig(
bits=4,
group_size=128,
desc_act=False,
checkpoint_format=CHECKPOINT_FORMAT.MARLIN if use_marlin else CHECKPOINT_FORMAT.GPTQ,
)
model = AutoGPTQForCausalLM.from_pretrained(
pretrained_model_dir,
quantize_config=quantize_config,
use_flash_attention_2=False,
)
model.quantize(examples)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = AutoGPTQForCausalLM.from_quantized(tmpdirname, device="cuda:0", use_marlin=use_marlin)
del model
torch.cuda.empty_cache()
# test compat: 1) with simple dict type 2) is_marlin_format
compat_quantize_config = {
"bits": 4,
"group_size": 128,
"desc_act": False,
"is_marlin_format": use_marlin,
}
model = AutoGPTQForCausalLM.from_quantized(tmpdirname, device="cuda:0", quantize_config=compat_quantize_config)
self.assertIsInstance(model.quantize_config, BaseQuantizeConfig)
del model
torch.cuda.empty_cache()
# test checkpoint_format hint to from_quantized()
os.remove(f"{tmpdirname}/{QUANT_CONFIG_FILENAME}")
compat_quantize_config = {
"bits": 4,
"group_size": 128,
"desc_act": False,
}
model = AutoGPTQForCausalLM.from_quantized(tmpdirname, device="cuda:0",
quantize_config=compat_quantize_config,
checkpoint_format=CHECKPOINT_FORMAT.MARLIN if use_marlin else None)
self.assertIsInstance(model.quantize_config, BaseQuantizeConfig)
import copy
import unittest
import autogptq_marlin_cuda
import torch
import torch.nn as nn
from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear as CudaOldQuantLinear
from auto_gptq.nn_modules.qlinear.qlinear_marlin import QuantLinear as MarlinQuantLinear
from auto_gptq.nn_modules.qlinear.qlinear_marlin import _get_perms, dequantize_weight
def gen_quant4(k, n, groupsize=-1):
maxq = 2 ** 4 - 1
w = torch.randn((k, n), dtype=torch.half, device="cpu")
original_w = w.clone()
if groupsize != -1:
w = w.reshape((-1, groupsize, n))
w = w.permute(1, 0, 2)
w = w.reshape((groupsize, -1))
s = torch.max(torch.abs(w), 0, keepdim=True)[0]
s *= 2 / maxq
# Quantize.
w = torch.round(w / s).int()
# Unsigned storage.
w += (maxq + 1) // 2
w = torch.clamp(w, 0, maxq)
# Dequantize.
ref = (w - (maxq + 1) // 2).half() * s
if groupsize != -1:
def reshape(w):
w = w.reshape((groupsize, -1, n))
w = w.permute(1, 0, 2)
w = w.reshape((k, n)).contiguous()
return w
ref = reshape(ref)
w = reshape(w)
s = s.reshape((-1, n)).contiguous()
linear = nn.Linear(k, n, bias=False)
linear.weight.data = ref.t()
return original_w, linear, s
original_w, linear, s = gen_quant4(64, 128)
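# Hedged sketch (not part of the deleted test file): a worked example of the
# symmetric 4-bit scheme in gen_quant4() above. With maxq = 15 and a per-column
# scale s = 2 * max(|w|) / maxq, a weight w = 0.30 at scale s = 0.08 maps to
# round(0.30 / 0.08) + 8 = 12 in unsigned storage and dequantizes back to
# (12 - 8) * 0.08 = 0.32, i.e. within s / 2 of the original value.
# The module-level call above makes the same point numerically: the maximum
# round-trip error stays on the order of half the largest per-column scale.
print("gen_quant4 max abs round-trip error:", (original_w - linear.weight.data.t()).abs().max().item())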
class TestRepacking(unittest.TestCase):
def test_marlin_fast_repacking(self):
k = 2048
n = 1024
m = 5
group_size = 128
_, linear, s = gen_quant4(k, n, group_size)
cuda_old_linear = CudaOldQuantLinear(bits=4, group_size=group_size, infeatures=k, outfeatures=n, bias=False)
zeros = torch.full((k // group_size, n), 8, dtype=torch.int32)
cuda_old_linear.pack(linear, s.T, zeros.T, g_idx=None)
# Adapted from utils.marlin_utils.convert_to_marlin
dequantized_weight, dequantized_qzeros = dequantize_weight(cuda_old_linear)
dequantized_weight = dequantized_weight.to(torch.float16)
self.assertTrue(torch.all(dequantized_qzeros == 8))
linear_module = torch.nn.Linear(
in_features=k,
out_features=n,
bias=False,
dtype=torch.float16,
device="cuda",
)
linear_module.weight.data.copy_(linear.weight.data)  # Use the exact reference weights rather than dequantized_weight to avoid approximation error
# Create new linear method and copy to model.
marlin_linear = MarlinQuantLinear(
bits=4,
group_size=group_size,
infeatures=k,
outfeatures=n,
bias=False,
trainable=False,
)
marlin_linear.pack(linear_module.to("cuda"), scales=copy.deepcopy(cuda_old_linear.scales.data.t()).to("cuda"))
inp = torch.rand(m, k, dtype=torch.float16, device="cuda")
cuda_old_linear = cuda_old_linear.to("cuda")
marlin_linear = marlin_linear.to("cuda")
with torch.no_grad():
res_cuda_old = cuda_old_linear(inp)
res_marlin = marlin_linear(inp)
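# Compare the two kernels via mean relative error; the 1e-12 term guards against division by zero for near-zero reference outputs.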
reldiff = (res_cuda_old - res_marlin).abs() / (res_cuda_old.abs() + 1e-12)
self.assertTrue(torch.mean(reldiff) < 4e-3)
weight_repacked = autogptq_marlin_cuda.gptq_repack(cuda_old_linear.qweight)
self.assertTrue(torch.allclose(weight_repacked, marlin_linear.B))
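# Re-derive Marlin's scale permutation by hand and check that it matches the scales stored by the packed Marlin layer.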
_, _scale_perm, _scale_perm_single = _get_perms()
s = cuda_old_linear.scales.data.clone()
if group_size != k:
s = s.reshape((1, -1))
s = s.reshape((-1, len(_scale_perm)))[:, _scale_perm]
else:
s = s.reshape((-1, len(_scale_perm_single)))[:, _scale_perm_single]
s = s.reshape((-1, n)).contiguous()
self.assertTrue(torch.allclose(s, marlin_linear.s))
import json
import os
import tempfile
import time
import unittest
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.quantization import CHECKPOINT_FORMAT, CHECKPOINT_FORMAT_FIELD, QUANT_CONFIG_FILENAME
from auto_gptq.quantization.config import QUANT_METHOD, BaseQuantizeConfig
class TestSerialization(unittest.TestCase):
MODEL_ID = "habanoz/TinyLlama-1.1B-Chat-v0.3-GPTQ"
def setUp(self):
dummy_config = BaseQuantizeConfig(
model_name_or_path=self.MODEL_ID,
quant_method=QUANT_METHOD.GPTQ,
checkpoint_format=CHECKPOINT_FORMAT.MARLIN)
model_cache_path, is_cached = dummy_config.get_cache_file_path()
if is_cached:
os.remove(model_cache_path)
def test_marlin_local_serialization(self):
start = time.time()
model = AutoGPTQForCausalLM.from_quantized(self.MODEL_ID, device="cuda:0", use_marlin=True)
end = time.time()
first_load_time = end - start
with tempfile.TemporaryDirectory() as tmpdir:
model.save_pretrained(tmpdir)
self.assertTrue(os.path.isfile(os.path.join(tmpdir, "model.safetensors")))
model_cache_path, is_cached = model.quantize_config.get_cache_file_path()
self.assertFalse(os.path.isfile(os.path.join(tmpdir, model_cache_path)))
with open(os.path.join(tmpdir, QUANT_CONFIG_FILENAME), "r") as config_file:
config = json.load(config_file)
self.assertTrue(config[CHECKPOINT_FORMAT_FIELD] == CHECKPOINT_FORMAT.MARLIN)
start = time.time()
model = AutoGPTQForCausalLM.from_quantized(tmpdir, device="cuda:0", use_marlin=True)
end = time.time()
second_load_time = end - start
# Since we use a CUDA kernel to repack weights, the first load time is already small.
self.assertTrue(second_load_time < first_load_time)
def test_marlin_hf_cache_serialization(self):
start = time.time()
model = AutoGPTQForCausalLM.from_quantized(self.MODEL_ID, device="cuda:0", use_marlin=True)
self.assertTrue(model.quantize_config.checkpoint_format == CHECKPOINT_FORMAT.MARLIN)
end = time.time()
first_load_time = end - start
model_cache_path, is_cached = model.quantize_config.get_cache_file_path()
self.assertTrue("assets" in model_cache_path)
self.assertTrue(is_cached)
start = time.time()
model = AutoGPTQForCausalLM.from_quantized(self.MODEL_ID, device="cuda:0", use_marlin=True)
self.assertTrue(model.quantize_config.checkpoint_format == CHECKPOINT_FORMAT.MARLIN)
end = time.time()
second_load_time = end - start
# Since we use a CUDA kernel to repack weights, the first load time is already small.
self.assertTrue(second_load_time < first_load_time)
import unittest
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
class TestShardedLoading(unittest.TestCase):
def test_loading(self):
model_name = "TheBlokeAI/llama-68m-GPTQ-sharded"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(model_name, device='cuda:0')
tokens = model.generate(**tokenizer("1337", return_tensors="pt").to(model.device), max_new_tokens=20)[0]
result = tokenizer.decode(tokens)
self.assertTrue(result == '<s> 133777777777777777777777')
def test_loading_large(self):
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat-GPTQ-Int4")
model = AutoGPTQForCausalLM.from_quantized("Qwen/Qwen1.5-7B-Chat-GPTQ-Int4", device='cuda:0')
tokens = model.generate(**tokenizer("Today I am in Paris and", return_tensors="pt").to(model.device), max_new_tokens=20)[0]
result = tokenizer.decode(tokens)
self.assertTrue(result == 'Today I am in Paris and I am going to the Louvre Museum. I want to see the Mona Lisa painting, but I')
import os
import unittest
import torch
import torch.utils.benchmark as benchmark
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
MODEL_ID = "TheBloke/Llama-7B-GPTQ"
DATASET_ID = "timdettmers/openassistant-guanaco"
LEARNING_RATE = 3e-5
MAX_SEQ_LEN = 10
BATCH_SIZE = 5
NUM_TRAIN_STEPS = 10
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def benchmark_forward(
fn,
*inputs,
repeats="auto",
desc="",
verbose=True,
amp=False,
amp_dtype=torch.float16,
**kwinputs,
):
if verbose:
print(desc, "- Forward pass")
def amp_wrapper(*inputs, **kwinputs):
with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
fn(*inputs, **kwinputs)
t = benchmark.Timer(
stmt="fn_amp(*inputs, **kwinputs)",
globals={"fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs},
num_threads=torch.get_num_threads(),
)
if repeats == "auto":
m = t.blocked_autorange()
else:
m = t.timeit(repeats)
if verbose:
print(m)
return t, m
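# Hedged usage sketch (not part of the deleted test file): benchmark_forward()
# can also time a plain fp16 nn.Linear as a baseline before comparing the
# quantized Triton kernels below; shapes here are illustrative only.
#
#   import torch.nn as nn
#   layer = nn.Linear(4096, 4096, bias=False, device="cuda", dtype=torch.float16)
#   x = torch.randn(1, 2048, 4096, device="cuda", dtype=torch.float16)
#   _, measurement = benchmark_forward(layer, x, desc="fp16 baseline", verbose=True)
#   print(f"fp16 baseline mean latency: {measurement.mean * 1e3:.3f} ms")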
def get_model_and_tokenizer(
model_id=MODEL_ID,
inject_fused_attention=False,
inject_fused_mlp=False,
**model_kwargs,
):
tokenizer = AutoTokenizer.from_pretrained(
MODEL_ID,
use_fast=True,
)
if tokenizer.pad_token_id is None:
tokenizer.pad_token_id = tokenizer.eos_token_id
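# Exllama kernels are disabled here so that the Triton-based QuantLinear requested through model_kwargs (use_triton / use_tritonv2) is the one being benchmarked.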
model = AutoGPTQForCausalLM.from_quantized(
model_id,
trainable=True,
inject_fused_attention=inject_fused_attention,
inject_fused_mlp=inject_fused_mlp,
disable_exllamav2=True,
disable_exllama=True,
**model_kwargs,
)
model.warmup_triton()
return model, tokenizer
class TestTriton(unittest.TestCase):
def test_triton_qlinear(self):
ref_model, _ = get_model_and_tokenizer(
model_id=MODEL_ID,
use_triton=True,
inject_fused_attention=False,
inject_fused_mlp=False,
)
test_model, _ = get_model_and_tokenizer(
model_id=MODEL_ID,
use_tritonv2=True,
inject_fused_attention=False,
inject_fused_mlp=False,
)
hidden_size = ref_model.model.model.embed_tokens.weight.shape[1]
test_data = torch.randn((1, 2048, hidden_size), dtype=torch.float16).cuda()
qlinear_ref = ref_model.model.model.layers[0].self_attn.q_proj
qlinear_test = test_model.model.model.layers[0].self_attn.q_proj
test_out = qlinear_test(test_data)
ref_out = qlinear_ref(test_data)
self.assertTrue(torch.allclose(test_out, ref_out))
_, measure_triton = benchmark_forward(qlinear_ref, test_data, desc="Triton", verbose=True)
_, measure_tritonv2 = benchmark_forward(qlinear_test, test_data, desc="Triton-v2", verbose=True)
self.assertTrue(measure_tritonv2.mean < measure_triton.mean)