Commit f8070792 authored by yangql

Deleted tests/__init__.py, tests/bench_autoawq_autogptq.py, tests/pytest.ini, tests/test_awq_compatibility_generation.py, tests/test_hpu_linear.py, tests/test_peft_conversion.py, tests/test_q4.py, tests/test_quantization.py, tests/test_repacking.py, tests/test_serialization.py, tests/test_sharded_loading.py, tests/test_triton.py files
parent a2630e0f
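
# ---- tests/__init__.py (deleted in this commit; no diff content shown, presumably empty) ----

# ---- tests/bench_autoawq_autogptq.py (deleted in this commit) ----
# Standalone benchmark comparing AWQ's GEMM/GEMV kernels against the GPTQ ExLlamaV2 kernel.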
import torch
try:
    from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV
except ModuleNotFoundError as e:
    raise ModuleNotFoundError(
        f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this benchmark. {e}"
    )
import numpy as np
from auto_gptq.modeling._utils import autogptq_post_init
from auto_gptq.nn_modules.qlinear.qlinear_exllamav2 import QuantLinear
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
group_size = 128
bits = 4
# Yi 34B down_proj
k = 20480
n = 7168
device = torch.device("cuda:0")
linear_class = dynamically_import_QuantLinear(use_triton=False, desc_act=False, group_size=group_size, bits=4)
linear_gptq = linear_class(
    bits=bits,
    group_size=group_size,
    infeatures=k,
    outfeatures=n,
    bias=False,
)
assert isinstance(linear_gptq, QuantLinear)
linear_gptq = linear_gptq.eval()
linear_gptq = linear_gptq.to(device)
linear_gptq = autogptq_post_init(linear_gptq, use_act_order=False)
num_runs = 60
lines = []
seqlens = [
    1,
    2,
    3,
    4,
    5,
    6,
    7,
    8,
    12,
    16,
    24,
    32,
    48,
    64,
    80,
    120,
    250,
    512,
    1024,
    2048,
    4000,
    8000,
]
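
# For each sequence length below, the loop measures the latency of a single quantized
# matmul with CUDA events (after one warmup call): first for the GPTQ ExLlamaV2 kernel,
# then for AWQ. AWQ's GEMV kernel is used when the total sequence length is <= 8 (the
# decoding regime) and its GEMM kernel otherwise; mean, p10 and p90 latencies over
# num_runs iterations are collected for both.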
print(f"in_features={k}, out_features={n}")
for query_length in seqlens:
    # batch_size, query_length, hidden_size
    inp = torch.rand(1, query_length, k, dtype=torch.float16).to(device)

    torch.cuda.empty_cache()

    # Warmup Exllama v2
    with torch.no_grad():
        res = linear_gptq(inp)

    latencies = []
    torch.cuda.synchronize()
    for _ in range(num_runs):
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        torch.cuda.synchronize()

        start_event.record()
        res = linear_gptq(inp)
        end_event.record()
        torch.cuda.synchronize()

        latency_ms = start_event.elapsed_time(end_event)
        latencies.append(latency_ms)

    # print("-------")
    # print(f"Latency GPTQ Exllama v2 (query_length={query_length}): {np.mean(latencies):.3f} ms, p10={np.percentile(latencies, 10):.3f}, p90={np.percentile(latencies, 90):.3f}")

    exllamav2_mean_latency = np.mean(latencies)
    exllamav2_p10 = np.percentile(latencies, 10)
    exllamav2_p90 = np.percentile(latencies, 90)

    torch.cuda.empty_cache()

    total_seqlen = inp.shape[:-1].numel()
    if total_seqlen <= 8:
        awq_kernel = "GEMV"
        linear_awq = WQLinear_GEMV(
            w_bit=bits,
            group_size=group_size,
            in_features=k,
            out_features=n,
            bias=False,
            dev=device,
        )
    else:
        awq_kernel = "GEMM"
        linear_awq = WQLinear_GEMM(
            w_bit=bits,
            group_size=group_size,
            in_features=k,
            out_features=n,
            bias=False,
            dev=device,
        )

    # Warmup AWQ
    with torch.no_grad():
        res = linear_awq(inp)

    latencies = []
    torch.cuda.synchronize()
    for _ in range(num_runs):
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        torch.cuda.synchronize()

        start_event.record()
        res = linear_awq(inp)
        end_event.record()
        torch.cuda.synchronize()

        latency_ms = start_event.elapsed_time(end_event)
        latencies.append(latency_ms)

    awq_mean_latency = np.mean(latencies)
    awq_p10 = np.percentile(latencies, 10)
    awq_p90 = np.percentile(latencies, 90)

    exllama_speedup = awq_mean_latency / exllamav2_mean_latency

    # print(f"Latency AWQ (query_length={query_length}, kernel={awq_kernel}): {np.mean(latencies):.3f} ms, p10={np.percentile(latencies, 10):.3f}, p90={np.percentile(latencies, 90):.3f}")

    line = "{},{},{},{},{},{},{},{},{},{},{}".format(
        bits,
        group_size,
        total_seqlen,
        awq_kernel,
        f"{awq_mean_latency:.3f}",
        f"{exllamav2_mean_latency:.3f}",
        f"{awq_p10:.3f}",
        f"{awq_p90:.3f}",
        f"{exllamav2_p10:.3f}",
        f"{exllamav2_p90:.3f}",
        f"{exllama_speedup:.3f}",
    )
    lines.append(line)
header = "bits, group_size, total_seqlen, awq_kernel, awq_mean_latency (ms), exllamav2_mean_latency (ms), awq_p10, awq_p90, exllamav2_p10, exllamav2_p90, exllama_speedup"
print(header)
for line in lines:
    print(line)
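
# The script prints a CSV header followed by one row per sequence length; stdout can be
# redirected to a file if the results are to be plotted or tabulated.

# ---- tests/pytest.ini (deleted in this commit) ----
# Pytest configuration for the test suite.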
[pytest]
addopts=-s -v
log_cli=true
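# -s disables output capture, -v prints one line per test, and log_cli=true streams log
# records to the terminal while the tests run.

# ---- tests/test_awq_compatibility_generation.py (deleted in this commit) ----
# Checks that AutoGPTQ kernel paths reproduce AutoAWQ generations on the same AWQ checkpoint.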
# ruff: noqa: I001
import unittest
import torch
import autogptq_cuda_64
import autogptq_cuda_256
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear as CudaOldQLinear
try:
    from awq import AutoAWQForCausalLM
except ModuleNotFoundError as e:
    AutoAWQForCausalLM = None
    AWQ_EXCEPTION = e


class TestAwqCompatibility(unittest.TestCase):
    # TODO: test cuda-old fp16.
    # TODO: test cuda-old fp32.
    # TODO: test exllama v2.
    def test_generation_cuda_old_fp32_pytorch(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"

        model_autogptq = AutoGPTQForCausalLM.from_quantized(
            quant_path,
            device=device,
            use_triton=False,
            inject_fused_attention=False,
            inject_fused_mlp=False,
            disable_exllama=True,
            disable_exllamav2=True,
            torch_dtype=torch.float32,
        )

        tokenizer = AutoTokenizer.from_pretrained(quant_path)
        prompt = "I am in Paris and I am going to see the"
        inp = tokenizer(prompt, return_tensors="pt").to(device)

        for name, submodule in model_autogptq.named_modules():
            if isinstance(submodule, CudaOldQLinear):
                # Just a hack to test the handmade pytorch implementation path.
                submodule.autogptq_cuda_available = False

        autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
        autogptq_output = tokenizer.decode(autogptq_output[0])

        model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
        awq_output = model_awq.generate(
            **inp,
            num_beams=1,
            min_new_tokens=30,
            max_new_tokens=30,
        )
        awq_output = tokenizer.decode(awq_output[0])

        self.assertTrue(awq_output == autogptq_output)

    def test_generation_cuda_old_cuda_256(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"
        tokenizer = AutoTokenizer.from_pretrained(quant_path)
        prompt = "I am in Paris and I am going to see the"

        for torch_dtype in [torch.float16, torch.float32]:
            model_autogptq = AutoGPTQForCausalLM.from_quantized(
                quant_path,
                device=device,
                use_triton=False,
                inject_fused_attention=False,
                inject_fused_mlp=False,
                disable_exllama=True,
                disable_exllamav2=True,
                torch_dtype=torch_dtype,
            )

            for name, module in model_autogptq.named_modules():
                if isinstance(module, CudaOldQLinear):
                    self.assertTrue(module.autogptq_cuda == autogptq_cuda_256)
                    if torch_dtype == torch.float32:
                        self.assertFalse(module.use_cuda_fp16)
                    else:
                        self.assertTrue(module.use_cuda_fp16)

            inp = tokenizer(prompt, return_tensors="pt").to(device)

            autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
            autogptq_output = tokenizer.decode(autogptq_output[0])

            model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
            awq_output = model_awq.generate(
                **inp,
                num_beams=1,
                min_new_tokens=30,
                max_new_tokens=30,
            )
            awq_output = tokenizer.decode(awq_output[0])

            self.assertTrue(awq_output == autogptq_output)

    def test_generation_cuda_old_cuda_64(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"
        tokenizer = AutoTokenizer.from_pretrained(quant_path)
        prompt = "I am in Paris and I am going to see the"

        for torch_dtype in [torch.float16, torch.float32]:
            model_autogptq = AutoGPTQForCausalLM.from_quantized(
                quant_path,
                device=device,
                use_triton=False,
                inject_fused_attention=False,
                inject_fused_mlp=False,
                disable_exllama=True,
                disable_exllamav2=True,
                torch_dtype=torch_dtype,
            )

            # Force autogptq_cuda_64.
            for name, module in model_autogptq.named_modules():
                if isinstance(module, CudaOldQLinear):
                    if module.autogptq_cuda != autogptq_cuda_64:
                        module.autogptq_cuda = autogptq_cuda_64
                    if torch_dtype == torch.float32:
                        self.assertFalse(module.use_cuda_fp16)
                    else:
                        self.assertTrue(module.use_cuda_fp16)

            inp = tokenizer(prompt, return_tensors="pt").to(device)

            autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
            autogptq_output = tokenizer.decode(autogptq_output[0])

            model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
            awq_output = model_awq.generate(
                **inp,
                num_beams=1,
                min_new_tokens=30,
                max_new_tokens=30,
            )
            awq_output = tokenizer.decode(awq_output[0])

            self.assertTrue(awq_output == autogptq_output)

    def test_generation_exllama(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"

        model_autogptq = AutoGPTQForCausalLM.from_quantized(
            quant_path,
            device=device,
            use_triton=False,
            inject_fused_attention=False,
            inject_fused_mlp=False,
            disable_exllama=False,
            disable_exllamav2=True,
            torch_dtype=torch.float16,
        )

        tokenizer = AutoTokenizer.from_pretrained(quant_path)
        prompt = "I am in Paris and I am going to see the"
        inp = tokenizer(prompt, return_tensors="pt").to(device)

        for name, submodule in model_autogptq.named_modules():
            if isinstance(submodule, CudaOldQLinear):
                # Just a hack to test the handmade pytorch implementation path.
                submodule.autogptq_cuda_available = False

        autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
        autogptq_output = tokenizer.decode(autogptq_output[0])

        model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
        awq_output = model_awq.generate(
            **inp,
            num_beams=1,
            min_new_tokens=30,
            max_new_tokens=30,
        )
        awq_output = tokenizer.decode(awq_output[0])

        self.assertTrue(awq_output == autogptq_output)

    def test_generation_exllamav2(self):
        if AutoAWQForCausalLM is None:
            self.skipTest(f"AutoAWQ package (https://github.com/casper-hansen/AutoAWQ) is required to run this test. {AWQ_EXCEPTION}")

        device = torch.device("cuda:0")
        quant_path = "TheBloke/Llama-2-7B-Chat-AWQ"

        model_autogptq = AutoGPTQForCausalLM.from_quantized(
            quant_path,
            device=device,
            use_triton=False,
            inject_fused_attention=False,
            inject_fused_mlp=False,
            torch_dtype=torch.float16,
        )

        tokenizer = AutoTokenizer.from_pretrained(quant_path)
        prompt = "I am in Paris and I am going to see the"
        inp = tokenizer(prompt, return_tensors="pt").to(device)

        for name, submodule in model_autogptq.named_modules():
            if isinstance(submodule, CudaOldQLinear):
                # Just a hack to test the handmade pytorch implementation path.
                submodule.autogptq_cuda_available = False

        autogptq_output = model_autogptq.model.generate(**inp, num_beams=1, min_new_tokens=30, max_new_tokens=30)
        autogptq_output = tokenizer.decode(autogptq_output[0])

        model_awq = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
        awq_output = model_awq.generate(
            **inp,
            num_beams=1,
            min_new_tokens=30,
            max_new_tokens=30,
        )
        awq_output = tokenizer.decode(awq_output[0])

        self.assertTrue(awq_output == autogptq_output)
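
# ---- tests/test_hpu_linear.py (deleted in this commit) ----
# Compares the HPU QuantLinear against the cuda-old QuantLinear reference.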
import numpy as np
import math
import torch
import pytest
try:
    import habana_frameworks.torch.core as htcore
except Exception as e:
    pytestmark = pytest.mark.skip("Couldn't import HPU plugin, skipping HPU tests")


def _convert_to_tensor_list(tensor_or_tensors):
    if isinstance(tensor_or_tensors, tuple):
        return list(tensor_or_tensors)
    elif isinstance(tensor_or_tensors, list):
        return tensor_or_tensors
    elif isinstance(tensor_or_tensors, torch.Tensor):
        # You can't return list(tensor_or_tensors), because it will fail on 0-d tensors
        result_list = []
        result_list.append(tensor_or_tensors)
        return result_list
    else:
        raise TypeError("Can not convert outputs")


def compare_tensors(hpu_tensors, cpu_tensors, atol, rtol, assert_enable=True):
    hpu_tensors = _convert_to_tensor_list(hpu_tensors)
    cpu_tensors = _convert_to_tensor_list(cpu_tensors)
    assert len(hpu_tensors) == len(cpu_tensors)

    hpu_tensors = [tensor.to('cpu') if tensor is not None else tensor for tensor in hpu_tensors]

    for i in range(len(hpu_tensors)):
        if cpu_tensors[i] is None and hpu_tensors[i] is None:
            continue
        hpu_tensors[i] = (
            hpu_tensors[i].float()
            if hpu_tensors[i].dtype in [torch.bfloat16, torch.float8_e5m2, torch.float8_e4m3fn]
            else hpu_tensors[i]
        )
        cpu_tensors[i] = (
            cpu_tensors[i].float()
            if cpu_tensors[i].dtype in [torch.bfloat16, torch.float8_e5m2, torch.float8_e4m3fn]
            else cpu_tensors[i]
        )
        if assert_enable:
            np.testing.assert_allclose(
                hpu_tensors[i].detach().numpy(),
                cpu_tensors[i].detach().numpy(),
                atol=atol,
                rtol=rtol,
            )
        else:
            print("hpu_result[{}]".format(i), hpu_tensors[i].detach().numpy())
            print("cpu_result[{}]".format(i), cpu_tensors[i].detach().numpy())
            return np.allclose(
                hpu_tensors[i].detach().numpy(),
                cpu_tensors[i].detach().numpy(),
                atol=atol,
                rtol=rtol,
                equal_nan=True,
            )
# taken from AutoGPTQ/tests/test_repacking.py
def gen_quant4(k, n, groupsize=-1, bias=False):
    maxq = 2 ** 4 - 1
    w = torch.randn((k, n), dtype=torch.bfloat16, device="cpu")
    original_w = w.clone()
    if groupsize != -1:
        w = w.reshape((-1, groupsize, n))
        w = w.permute(1, 0, 2)
        w = w.reshape((groupsize, -1))
    s = torch.max(torch.abs(w), 0, keepdim=True)[0]
    s *= 2 / maxq
    # Quantize.
    w = torch.round(w / s).int()
    # Unsigned storage.
    w += (maxq + 1) // 2
    w = torch.clamp(w, 0, maxq)
    # Dequantize.
    ref = (w - (maxq + 1) // 2).bfloat16() * s
    if groupsize != -1:
        def reshape(w):
            w = w.reshape((groupsize, -1, n))
            w = w.permute(1, 0, 2)
            w = w.reshape((k, n)).contiguous()
            return w
        ref = reshape(ref)
        w = reshape(w)
    s = s.reshape((-1, n)).contiguous()
    linear = torch.nn.Linear(k, n, bias=bias)
    linear.weight.data = ref.t()
    return original_w, linear, s
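
# A worked example of the scheme above: maxq = 2**4 - 1 = 15, the scale per group (or per
# column when groupsize == -1) is s = 2 * max|w| / 15, weights are stored unsigned as
# q = clamp(round(w / s) + 8, 0, 15), and dequantized back as (q - 8) * s. The returned
# nn.Linear holds the dequantized reference weights that the packed QuantLinear outputs
# are compared against.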
@pytest.mark.parametrize("bits", [4])
@pytest.mark.parametrize("group_size", [16, 32, 128])
@pytest.mark.parametrize("infeatures", [64, 128, 512, 4096, 11008])
@pytest.mark.parametrize("outfeatures", [64, 128, 512, 4096, 11008])
@pytest.mark.parametrize("bias", [True, False], ids=["bias", "no_bias"])
@pytest.mark.parametrize("scales_value, weight_value, zeros_value", [("normal", "normal", "normal"), ("normal", "normal", "range"), ("normal", "normal", "zeros"), ("ones", "zeros", "zeros"), ("ones", "zeros", "eights"), ("ones", "range", "zeros"), ("ones", "range", "ones"), ("ones", "7", "ones"), ("ones", "zeros", "range"),("ones", "zeros", "ones"), ("ones", "range", "range"), ("range", "range", "range"), ("range", "range", "zeros")])
@pytest.mark.parametrize("weight_dtype", [torch.bfloat16, torch.float32], ids=["bf16", "fp32"])
def test_qlinear_hpu(bits, group_size, infeatures, outfeatures, bias, scales_value, weight_value, zeros_value, weight_dtype):
    qweight_shape_0 = infeatures // 32 * bits
    qzeros_shape_0 = math.ceil(infeatures / group_size)
    qzeros_shape_1 = outfeatures // 32 * bits
    if qweight_shape_0 == 0 or qzeros_shape_0 == 0 or qzeros_shape_1 == 0:
        pytest.skip(f"{qweight_shape_0=} == 0 or {qzeros_shape_0=} == 0 or {qzeros_shape_1=} == 0")
    if infeatures < group_size:
        pytest.skip(f"{infeatures=} < {group_size=}")
    if infeatures != outfeatures:
        pytest.skip(f"{infeatures=} != {outfeatures=}")

    trainable = False
    use_cuda_fp16 = False
    kernel_switch_threshold = 128

    from auto_gptq.nn_modules.qlinear import qlinear_hpu, qlinear_cuda_old

    quant_hpu = qlinear_hpu.QuantLinear(bits=bits, group_size=group_size, infeatures=infeatures, outfeatures=outfeatures, bias=bias, use_cuda_fp16=use_cuda_fp16, kernel_switch_threshold=kernel_switch_threshold, trainable=trainable, weight_dtype=weight_dtype).to("hpu")
    # Cuda old implementation is the reference, also runs on hpu
    quant_ref_cuda_old = qlinear_cuda_old.QuantLinear(bits=bits, group_size=group_size, infeatures=infeatures, outfeatures=outfeatures, bias=bias, use_cuda_fp16=use_cuda_fp16, kernel_switch_threshold=kernel_switch_threshold, trainable=trainable, weight_dtype=weight_dtype).to("hpu")

    input = torch.rand((infeatures, outfeatures), dtype=weight_dtype).to("hpu")
    _, linear, s = gen_quant4(infeatures, outfeatures, group_size, bias)

    if scales_value == "ones":
        s = torch.ones_like(s)
    if scales_value == "range":
        range_t = torch.tensor(list(range(1, s.numel()+1)), dtype=torch.int32)
        shape_s = s.shape
        s = (torch.ones(s.numel()) * range_t).reshape(shape_s).contiguous()

    if weight_value == "ones":
        linear.weight = torch.nn.Parameter(torch.ones_like(linear.weight))
    elif weight_value == "zeros":
        linear.weight = torch.nn.Parameter(torch.zeros_like(linear.weight))
    elif weight_value == "range":
        shape_w = linear.weight.shape
        weight_local = torch.ones(shape_w, dtype=torch.int32)
        range_t_weight = torch.tensor(list(range(0, 8)), dtype=torch.int32)
        linear.weight = torch.nn.Parameter((torch.ones(weight_local.numel(), dtype=linear.weight.dtype).reshape(-1, 8) * range_t_weight).reshape(shape_w).contiguous())
    elif weight_value.isnumeric():
        linear.weight = torch.nn.Parameter(torch.full_like(linear.weight, int(weight_value)))
    linear.weight = torch.nn.Parameter(linear.weight.to(weight_dtype))

    if zeros_value == "zeros":
        zeros = torch.full((infeatures // group_size, outfeatures), 0, dtype=torch.int32)
    elif zeros_value == "range":
        zeros = torch.ones((infeatures // group_size, outfeatures), dtype=torch.int32)
        range_t_zeros = torch.tensor(list(range(1, 9)), dtype=torch.int32)
        shape_z = zeros.shape
        zeros = (torch.ones(zeros.numel(), dtype=torch.int32).reshape(-1, 8) * range_t_zeros).reshape(shape_z).contiguous()
    elif zeros_value == "eights":
        zeros = torch.full((infeatures // group_size, outfeatures), 8, dtype=torch.int32)
    else:
        zeros = torch.full((infeatures // group_size, outfeatures), 1, dtype=torch.int32)

    htcore.mark_step()
    quant_ref_cuda_old.pack(linear, s.clone().detach().T, zeros.clone().detach().T, g_idx=None)
    htcore.mark_step()
    quant_ref_cuda_old.to("hpu")

    # TODO: pack independently
    quant_hpu.set_packed(quant_ref_cuda_old)
    htcore.mark_step()
    quant_hpu.to("hpu")

    out_ref_cuda_old = quant_ref_cuda_old(input)
    htcore.mark_step()
    quant_hpu.post_init()
    htcore.mark_step()
    out_hpu = quant_hpu(input)
    htcore.mark_step()

    out_ref_cuda_old = out_ref_cuda_old.cpu()
    out_hpu = out_hpu.cpu()
    compare_tensors(out_hpu.cpu(), out_ref_cuda_old.cpu(), rtol=1e-05, atol=1e-08)
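
# ---- tests/test_peft_conversion.py (deleted in this commit) ----
# LoRA / AdaLoRA adapter injection and trainability on a quantized GPTQ model.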
import math
from unittest import TestCase
import torch.cuda.amp
from peft import TaskType
from peft.peft_model import PeftModelForCausalLM
from torch.optim import Adam
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.utils.peft_utils import (
    GPTQAdaLoraConfig,
    GPTQLoraConfig,
    GPTQLoraLinear,
    GPTQSVDLinear,
    get_gptq_peft_model,
)
MODEL_NAME = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
class TestPeftConversion(TestCase):
    def check_model_trainable(self, model_lora: PeftModelForCausalLM, tokenizer: AutoTokenizer) -> None:
        batch = tokenizer("Hello, world", return_tensors="pt")
        batch = {key: value.to(model_lora.device) for key, value in batch.items()}
        batch["labels"] = batch["input_ids"]
        batch["attention_mask"] = batch["attention_mask"].float()
        batch["attention_mask"].requires_grad = True

        model_lora.gradient_checkpointing_enable()
        optimizer = Adam(model_lora.parameters(), lr=1e-4)
        model_lora.train()

        losses = []
        for _ in range(30):
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                loss = model_lora(**batch).loss
            losses.append(loss.item())
            loss.backward()
            optimizer.step()

        self.assertTrue(losses[0] > losses[-1])
        self.assertTrue(all(math.isfinite(loss) for loss in losses))
        self.assertTrue(not any(math.isnan(loss) for loss in losses))

    def test_lora_conversion(self):
        model = AutoGPTQForCausalLM.from_quantized(
            MODEL_NAME,
            use_triton=False,
            warmup_triton=False,
            trainable=True,
            inject_fused_attention=True,
            inject_fused_mlp=False,
            use_safetensors=True,
        )
        peft_config = GPTQLoraConfig(
            r=16,
            lora_alpha=32,
            lora_dropout=0.1,
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            target_modules=["qkv_proj"],
        )
        model_lora = get_gptq_peft_model(
            model,
            peft_config,
            adapter_name="test",
            auto_find_all_linears=False,
            train_mode=True,
        )
        linear_layer = model_lora.base_model.model.model.layers[0].self_attn.qkv_proj
        self.assertTrue(isinstance(linear_layer, GPTQLoraLinear))

        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.check_model_trainable(model_lora, tokenizer)

    def test_adalora_conversion(self):
        model = AutoGPTQForCausalLM.from_quantized(
            MODEL_NAME,
            use_triton=False,
            warmup_triton=False,
            trainable=True,
            inject_fused_attention=True,
            inject_fused_mlp=False,
            use_safetensors=True,
        )
        peft_config = GPTQAdaLoraConfig(
            init_r=20,
            target_r=16,
            beta1=0.85,
            beta2=0.85,
            tinit=200,
            tfinal=1000,
            deltaT=10,
            lora_alpha=32,
            lora_dropout=0.1,
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            target_modules=["qkv_proj"],
        )
        model_lora = get_gptq_peft_model(
            model,
            peft_config,
            adapter_name="test",
            auto_find_all_linears=False,
            train_mode=True,
        )
        linear_layer = model_lora.base_model.model.model.layers[0].self_attn.qkv_proj
        self.assertTrue(isinstance(linear_layer, GPTQSVDLinear))

        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.check_model_trainable(model_lora, tokenizer)
# ---- tests/test_q4.py (deleted in this commit) ----
# (Diff collapsed in the original page; file contents not shown.)
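
# ---- tests/test_quantization.py (deleted in this commit) ----
# End-to-end quantization, saving and reloading in GPTQ and Marlin checkpoint formats.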
import os
import tempfile
import unittest
import torch.cuda
from parameterized import parameterized
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.quantization import CHECKPOINT_FORMAT, QUANT_CONFIG_FILENAME, BaseQuantizeConfig
class TestQuantization(unittest.TestCase):
    @parameterized.expand([(False,), (True,)])
    def test_quantize(self, use_marlin: bool):
        pretrained_model_dir = "saibo/llama-1B"

        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
        examples = [
            tokenizer(
                "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
            ),
            tokenizer(
                "Today I am in Paris and it is a wonderful day."
            ),
        ]

        quantize_config = BaseQuantizeConfig(
            bits=4,
            group_size=128,
            desc_act=False,
            checkpoint_format=CHECKPOINT_FORMAT.MARLIN if use_marlin else CHECKPOINT_FORMAT.GPTQ,
        )
        model = AutoGPTQForCausalLM.from_pretrained(
            pretrained_model_dir,
            quantize_config=quantize_config,
            use_flash_attention_2=False,
        )
        model.quantize(examples)

        with tempfile.TemporaryDirectory() as tmpdirname:
            model.save_pretrained(tmpdirname)

            model = AutoGPTQForCausalLM.from_quantized(tmpdirname, device="cuda:0", use_marlin=use_marlin)
            del model
            torch.cuda.empty_cache()

            # test compat: 1) with simple dict type 2) is_marlin_format
            compat_quantize_config = {
                "bits": 4,
                "group_size": 128,
                "desc_act": False,
                "is_marlin_format": use_marlin,
            }
            model = AutoGPTQForCausalLM.from_quantized(tmpdirname, device="cuda:0", quantize_config=compat_quantize_config)
            assert isinstance(model.quantize_config, BaseQuantizeConfig)
            del model
            torch.cuda.empty_cache()

            # test checkpoint_format hint to from_quantized()
            os.remove(f"{tmpdirname}/{QUANT_CONFIG_FILENAME}")
            compat_quantize_config = {
                "bits": 4,
                "group_size": 128,
                "desc_act": False,
            }
            model = AutoGPTQForCausalLM.from_quantized(
                tmpdirname,
                device="cuda:0",
                quantize_config=compat_quantize_config,
                checkpoint_format=CHECKPOINT_FORMAT.MARLIN if use_marlin else None,
            )
            assert isinstance(model.quantize_config, BaseQuantizeConfig)
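
# ---- tests/test_repacking.py (deleted in this commit) ----
# Repacking cuda-old GPTQ weights into the Marlin format and checking numerical agreement.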
import copy
import unittest
import autogptq_marlin_cuda
import torch
import torch.nn as nn
from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear as CudaOldQuantLinear
from auto_gptq.nn_modules.qlinear.qlinear_marlin import QuantLinear as MarlinQuantLinear
from auto_gptq.nn_modules.qlinear.qlinear_marlin import _get_perms, dequantize_weight
def gen_quant4(k, n, groupsize=-1):
    maxq = 2 ** 4 - 1
    w = torch.randn((k, n), dtype=torch.half, device="cpu")
    original_w = w.clone()
    if groupsize != -1:
        w = w.reshape((-1, groupsize, n))
        w = w.permute(1, 0, 2)
        w = w.reshape((groupsize, -1))
    s = torch.max(torch.abs(w), 0, keepdim=True)[0]
    s *= 2 / maxq
    # Quantize.
    w = torch.round(w / s).int()
    # Unsigned storage.
    w += (maxq + 1) // 2
    w = torch.clamp(w, 0, maxq)
    # Dequantize.
    ref = (w - (maxq + 1) // 2).half() * s
    if groupsize != -1:
        def reshape(w):
            w = w.reshape((groupsize, -1, n))
            w = w.permute(1, 0, 2)
            w = w.reshape((k, n)).contiguous()
            return w
        ref = reshape(ref)
        w = reshape(w)
    s = s.reshape((-1, n)).contiguous()
    linear = nn.Linear(k, n, bias=False)
    linear.weight.data = ref.t()
    return original_w, linear, s
original_w, linear, s = gen_quant4(64, 128)
class TestRepacking(unittest.TestCase):
    def test_marlin_fast_repacking(self):
        k = 2048
        n = 1024
        m = 5
        group_size = 128

        _, linear, s = gen_quant4(k, n, group_size)

        cuda_old_linear = CudaOldQuantLinear(bits=4, group_size=group_size, infeatures=k, outfeatures=n, bias=False)

        zeros = torch.full((k // group_size, n), 8, dtype=torch.int32)
        cuda_old_linear.pack(linear, s.T, zeros.T, g_idx=None)

        # Adapted from utils.marlin_utils.convert_to_marlin
        dequantized_weight, dequantized_qzeros = dequantize_weight(cuda_old_linear)
        dequantized_weight = dequantized_weight.to(torch.float16)
        self.assertTrue(torch.all(dequantized_qzeros == 8))

        linear_module = torch.nn.Linear(
            in_features=k,
            out_features=n,
            bias=False,
            dtype=torch.float16,
            device="cuda",
        )
        linear_module.weight.data.copy_(linear.weight.data)  # Not using dequantized_weight to avoid approx

        # Create new linear method and copy to model.
        marlin_linear = MarlinQuantLinear(
            bits=4,
            group_size=group_size,
            infeatures=k,
            outfeatures=n,
            bias=False,
            trainable=False,
        )
        marlin_linear.pack(linear_module.to("cuda"), scales=copy.deepcopy(cuda_old_linear.scales.data.t()).to("cuda"))

        inp = torch.rand(m, k, dtype=torch.float16, device="cuda")

        cuda_old_linear = cuda_old_linear.to("cuda")
        marlin_linear = marlin_linear.to("cuda")
        with torch.no_grad():
            res_cuda_old = cuda_old_linear(inp)
            res_marlin = marlin_linear(inp)

        reldiff = (res_cuda_old - res_marlin).abs() / (res_cuda_old.abs() + 1e-12)
        self.assertTrue(torch.mean(reldiff) < 4e-3)

        weight_repacked = autogptq_marlin_cuda.gptq_repack(cuda_old_linear.qweight)
        self.assertTrue(torch.allclose(weight_repacked, marlin_linear.B))

        _, _scale_perm, _scale_perm_single = _get_perms()
        s = cuda_old_linear.scales.data.clone()
        if group_size != k:
            s = s.reshape((1, -1))
            s = s.reshape((-1, len(_scale_perm)))[:, _scale_perm]
        else:
            s = s.reshape((-1, len(_scale_perm_single)))[:, _scale_perm_single]
        s = s.reshape((-1, n)).contiguous()
        self.assertTrue(torch.allclose(s, marlin_linear.s))
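
# ---- tests/test_serialization.py (deleted in this commit) ----
# Marlin checkpoint serialization, local saving and HF cache behaviour.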
import json
import os
import tempfile
import time
import unittest
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.quantization import CHECKPOINT_FORMAT, CHECKPOINT_FORMAT_FIELD, QUANT_CONFIG_FILENAME
from auto_gptq.quantization.config import QUANT_METHOD, BaseQuantizeConfig
class TestSerialization(unittest.TestCase):
    MODEL_ID = "habanoz/TinyLlama-1.1B-Chat-v0.3-GPTQ"

    def setUp(self):
        dummy_config = BaseQuantizeConfig(
            model_name_or_path=self.MODEL_ID,
            quant_method=QUANT_METHOD.GPTQ,
            checkpoint_format=CHECKPOINT_FORMAT.MARLIN,
        )
        model_cache_path, is_cached = dummy_config.get_cache_file_path()
        if is_cached:
            os.remove(model_cache_path)

    def test_marlin_local_serialization(self):
        start = time.time()
        model = AutoGPTQForCausalLM.from_quantized(self.MODEL_ID, device="cuda:0", use_marlin=True)
        end = time.time()
        first_load_time = end - start

        with tempfile.TemporaryDirectory() as tmpdir:
            model.save_pretrained(tmpdir)
            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "model.safetensors")))

            model_cache_path, is_cached = model.quantize_config.get_cache_file_path()
            self.assertFalse(os.path.isfile(os.path.join(tmpdir, model_cache_path)))

            with open(os.path.join(tmpdir, QUANT_CONFIG_FILENAME), "r") as config_file:
                config = json.load(config_file)
            self.assertTrue(config[CHECKPOINT_FORMAT_FIELD] == CHECKPOINT_FORMAT.MARLIN)

            start = time.time()
            model = AutoGPTQForCausalLM.from_quantized(tmpdir, device="cuda:0", use_marlin=True)
            end = time.time()
            second_load_time = end - start

            # Since we use a CUDA kernel to repack weights, the first load time is already small.
            self.assertTrue(second_load_time < first_load_time)

    def test_marlin_hf_cache_serialization(self):
        start = time.time()
        model = AutoGPTQForCausalLM.from_quantized(self.MODEL_ID, device="cuda:0", use_marlin=True)
        self.assertTrue(model.quantize_config.checkpoint_format == CHECKPOINT_FORMAT.MARLIN)
        end = time.time()
        first_load_time = end - start

        model_cache_path, is_cached = model.quantize_config.get_cache_file_path()
        self.assertTrue("assets" in model_cache_path)
        self.assertTrue(is_cached)

        start = time.time()
        model = AutoGPTQForCausalLM.from_quantized(self.MODEL_ID, device="cuda:0", use_marlin=True)
        self.assertTrue(model.quantize_config.checkpoint_format == CHECKPOINT_FORMAT.MARLIN)
        end = time.time()
        second_load_time = end - start

        # Since we use a CUDA kernel to repack weights, the first load time is already small.
        self.assertTrue(second_load_time < first_load_time)
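
# ---- tests/test_sharded_loading.py (deleted in this commit) ----
# Loading sharded GPTQ checkpoints and checking generation output.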
import unittest
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
class TestShardedLoading(unittest.TestCase):
    def test_loading(self):
        model_name = "TheBlokeAI/llama-68m-GPTQ-sharded"

        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        model = AutoGPTQForCausalLM.from_quantized(model_name, device='cuda:0')

        tokens = model.generate(**tokenizer("1337", return_tensors="pt").to(model.device), max_new_tokens=20)[0]
        result = tokenizer.decode(tokens)
        self.assertTrue(result == '<s> 133777777777777777777777')

    def test_loading_large(self):
        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat-GPTQ-Int4")
        model = AutoGPTQForCausalLM.from_quantized("Qwen/Qwen1.5-7B-Chat-GPTQ-Int4", device='cuda:0')

        tokens = model.generate(**tokenizer("Today I am in Paris and", return_tensors="pt").to(model.device), max_new_tokens=20)[0]
        result = tokenizer.decode(tokens)
        self.assertTrue(result == 'Today I am in Paris and I am going to the Louvre Museum. I want to see the Mona Lisa painting, but I')
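
# ---- tests/test_triton.py (deleted in this commit) ----
# Triton vs. Triton-v2 QuantLinear: output equivalence and forward-pass speed.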
import os
import unittest
import torch
import torch.utils.benchmark as benchmark
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
MODEL_ID = "TheBloke/Llama-7B-GPTQ"
DATASET_ID = "timdettmers/openassistant-guanaco"
LEARNING_RATE = 3e-5
MAX_SEQ_LEN = 10
BATCH_SIZE = 5
NUM_TRAIN_STEPS = 10
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def benchmark_forward(
    fn,
    *inputs,
    repeats="auto",
    desc="",
    verbose=True,
    amp=False,
    amp_dtype=torch.float16,
    **kwinputs,
):
    if verbose:
        print(desc, "- Forward pass")

    def amp_wrapper(*inputs, **kwinputs):
        with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
            fn(*inputs, **kwinputs)

    t = benchmark.Timer(
        stmt="fn_amp(*inputs, **kwinputs)",
        globals={"fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs},
        num_threads=torch.get_num_threads(),
    )
    if repeats == "auto":
        m = t.blocked_autorange()
    else:
        m = t.timeit(repeats)
    if verbose:
        print(m)
    return t, m
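
# With repeats="auto", Timer.blocked_autorange() keeps running batches of the statement
# until enough total measurement time has accumulated for a stable estimate; passing an
# integer instead times exactly that many runs via Timer.timeit(repeats).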
def get_model_and_tokenizer(
    model_id=MODEL_ID,
    inject_fused_attention=False,
    inject_fused_mlp=False,
    **model_kwargs,
):
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        use_fast=True,
    )
    if not tokenizer.pad_token_id:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        trainable=True,
        inject_fused_attention=inject_fused_attention,
        inject_fused_mlp=inject_fused_mlp,
        disable_exllamav2=True,
        disable_exllama=True,
        **model_kwargs,
    )
    model.warmup_triton()
    return model, tokenizer
class TestTriton(unittest.TestCase):
    def test_triton_qlinear(self):
        ref_model, _ = get_model_and_tokenizer(
            model_id=MODEL_ID,
            use_triton=True,
            inject_fused_attention=False,
            inject_fused_mlp=False,
        )
        test_model, _ = get_model_and_tokenizer(
            model_id=MODEL_ID,
            use_tritonv2=True,
            inject_fused_attention=False,
            inject_fused_mlp=False,
        )
        hidden_size = ref_model.model.model.embed_tokens.weight.shape[1]
        test_data = torch.randn((1, 2048, hidden_size), dtype=torch.float16).cuda()

        qlinear_ref = ref_model.model.model.layers[0].self_attn.q_proj
        qlinear_test = test_model.model.model.layers[0].self_attn.q_proj

        test_out = qlinear_test(test_data)
        ref_out = qlinear_ref(test_data)
        self.assertTrue(torch.allclose(test_out, ref_out))

        _, measure_triton = benchmark_forward(qlinear_ref, test_data, desc="Triton", verbose=True)
        _, measure_tritonv2 = benchmark_forward(qlinear_test, test_data, desc="Triton-v2", verbose=True)
        self.assertTrue(measure_tritonv2.mean < measure_triton.mean)