Commit da900c3b authored by yangql

Initial commit
import copy
import logging
import os
from os.path import isdir, join
from typing import Dict, List, Optional, Union
import accelerate
import torch
import torch.nn as nn
import transformers
from accelerate.hooks import remove_hook_from_module
from safetensors import safe_open
from safetensors.torch import load_file as safe_load
from safetensors.torch import save_file as safe_save
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForCausalLM, PreTrainedModel
from transformers.modeling_utils import no_init_weights
from transformers.utils.generic import ContextManagers
from transformers.utils.hub import (
CommitOperationAdd,
PushToHubMixin,
create_commit,
create_repo,
)
from ..nn_modules._fused_base import FusedBaseAttentionModule, FusedBaseMLPModule
from ..nn_modules.qlinear import GeneralQuantLinear
from ..quantization import GPTQ, BaseQuantizeConfig
from ..quantization.config import (
CHECKPOINT_FORMAT,
CHECKPOINT_FORMAT_FIELD,
QUANT_METHOD_FIELD,
QUANTIZE_BLACK_LIST,
)
from ..utils.accelerate_utils import load_checkpoint_in_model
from ..utils.data_utils import collate_data
from ..utils.import_utils import (
AUTOGPTQ_CUDA_AVAILABLE,
EXLLAMA_KERNELS_AVAILABLE,
EXLLAMAV2_KERNELS_AVAILABLE,
MARLIN_AVAILABLE,
QIGEN_AVAILABLE,
TRITON_AVAILABLE,
dynamically_import_QuantLinear,
)
from ..utils.marlin_utils import (
_validate_marlin_compatibility,
_validate_marlin_device_support,
prepare_model_for_marlin_load,
)
from ._const import CPU, CUDA_0, SUPPORTED_MODELS
from ._utils import (
autogptq_post_init,
find_layers,
get_checkpoints,
get_device,
get_module_by_name_prefix,
get_module_by_name_suffix,
make_quant,
make_sure_no_tensor_in_meta_device,
move_to_device,
pack_from_tensors,
pack_model,
preprocess_checkpoint_qigen,
simple_dispatch_model,
unpack_awq,
)
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
formatter = logging.Formatter("%(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.propagate = False
logger.addHandler(handler)
logger.setLevel(logging.INFO)
def nested_move_to_device(v, device):
if isinstance(v, torch.Tensor):
return move_to_device(v, device)
elif isinstance(v, (list, tuple)):
return type(v)([nested_move_to_device(e, device) for e in v])
else:
return v
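# The class attributes below (layer_type, layers_block_name, outside_layer_modules,
# inside_layer_modules, ...) are placeholders; each architecture listed in SUPPORTED_MODELS
# is expected to override them in its own BaseGPTQForCausalLM subclass.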
class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
layer_type: str = None
layers_block_name: str = None
outside_layer_modules: List[str] = None
inside_layer_modules: List[List[str]] = None
lm_head_name: str = "lm_head"
fused_attn_module_type: Optional[FusedBaseAttentionModule] = None
fused_mlp_module_type: Optional[FusedBaseMLPModule] = None
def __init__(
self,
model: PreTrainedModel,
quantized: bool,
quantize_config: BaseQuantizeConfig,
is_triton_backend: bool = False,
injected_fused_attention: bool = False,
injected_fused_mlp: bool = False,
trainable: bool = False,
):
super().__init__()
self.model = model
self.model_type = self.model.config.model_type
self._quantized = quantized
self.quantize_config = quantize_config
self.config = self.model.config
self.is_triton_backend = is_triton_backend
self.injected_fused_attention = injected_fused_attention
self.injected_fused_mlp = injected_fused_mlp
self.trainable = trainable
@property
def quantized(self):
return self._quantized
@property
def hf_device_map(self):
return getattr(self.model, "hf_device_map", None)
def _prepare_examples_for_quantization(
self,
examples: List[Dict[str, Union[List[int], torch.LongTensor]]],
batch_size: int = 1,
):
def _convert_tensor_to_list(tensor):
if isinstance(tensor, torch.Tensor):
if len(tensor.shape) == 1:
tensor = tensor.unsqueeze(0)
tensor = tensor.long()
return tensor.cpu().numpy().tolist()
return [tensor]
new_examples = []
for example in examples:
input_ids = _convert_tensor_to_list(example["input_ids"])
attention_mask = _convert_tensor_to_list(example["attention_mask"])
if "labels" in example:
labels = _convert_tensor_to_list(example["labels"])
elif "label" in example:
labels = _convert_tensor_to_list(example["label"])
elif "label_ids" in example:
labels = _convert_tensor_to_list(example["label_ids"])
else:
labels = copy.deepcopy(input_ids)
new_examples.append(
{
"input_ids": input_ids,
"attention_mask": attention_mask,
"labels": labels,
}
)
pad_token_id = self.config.pad_token_id
if not pad_token_id:
pad_token_id = self.config.eos_token_id
new_examples = [
collate_data(new_examples[start : start + batch_size], pad_token_id)
for start in range(0, len(new_examples), batch_size)
]
for new_example in new_examples:
del new_example["labels"]
return new_examples
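# Usage sketch (illustrative only, model id is a placeholder): each element of the
# `examples` list passed to quantize() should provide "input_ids" and "attention_mask"
# (and optionally labels), either as tensors or as lists of ints, e.g. built with a
# Hugging Face tokenizer:
#
#     tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
#     examples = [tokenizer("auto-gptq is an easy-to-use quantization library", return_tensors="pt")]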
@torch.inference_mode()
def quantize(
self,
examples: List[Dict[str, Union[List[int], torch.LongTensor]]],
batch_size: int = 1,
use_triton: bool = False,
use_cuda_fp16: bool = True,
autotune_warmup_after_quantized: bool = False,
cache_examples_on_gpu: bool = True,
):
if self.quantized:
raise EnvironmentError("can't execute quantize because the model is quantized.")
if self.quantize_config.quant_method in QUANTIZE_BLACK_LIST:
raise ValueError(f"Unsupported quantization operation for quant method: {self.quantize_config.quant_method}")
if use_triton and not TRITON_AVAILABLE:
logger.warning("triton is not installed, reset use_triton to False")
use_triton = False
device_map = self.hf_device_map
if device_map:
for name, device in device_map.items():
if device == "cpu":
logger.info(f"truly offloading {name} to cpu with hook.")
module = get_module_by_name_suffix(self.model, name)
remove_hook_from_module(module, recurse=True)
accelerate.cpu_offload_with_hook(module, CUDA_0)
layer_inputs = []
attention_masks = []
position_ids = []
layer_input_kwargs = []
layer_outputs = []
examples = self._prepare_examples_for_quantization(examples, batch_size)
forward_pass_use_cache = self.model.config.use_cache
self.model.config.use_cache = False
num_batches = len(examples)
layers = get_module_by_name_prefix(self.model, self.layers_block_name)
cur_layer_device = get_device(layers[0])
data_device = cur_layer_device if cache_examples_on_gpu else CPU
def store_input_hook(_, args, kwargs):
# Positional arguments.
layer_input = []
for inp in args:
layer_input.append(move_to_device(inp, data_device))
layer_inputs.append(layer_input)
# Keyword arguments.
if kwargs["attention_mask"] is not None:
attention_masks.append(kwargs["attention_mask"].to(data_device))
else:
attention_masks.append(None)
pos_ids = kwargs.get("position_ids", None)
if pos_ids is not None:
position_ids.append(move_to_device(pos_ids, data_device))
one_kwargs = {}
for k, v in kwargs.items():  # make sure other arguments are also captured
if k not in ["hidden_states", "attention_mask", "position_ids"]:
one_kwargs[k] = nested_move_to_device(v, data_device)
layer_input_kwargs.append(one_kwargs)
raise ValueError
force_layer_back_to_cpu = False
if get_device(layers[0]) == CPU:
layers[0] = layers[0].to(CUDA_0)
force_layer_back_to_cpu = True
ori_outside_layer_module_devices = {}
for module_name in self.outside_layer_modules:
module = get_module_by_name_prefix(self.model, module_name)
if module is None:
continue
ori_outside_layer_module_devices[module_name] = get_device(module)
if module is not None:
move_to_device(module, cur_layer_device)
# TODO: make this optional, backporting https://github.com/huggingface/optimum/blob/main/optimum/gptq/quantizer.py
handle = layers[0].register_forward_pre_hook(store_input_hook, with_kwargs=True)
for example in examples:
for k, v in example.items():
if len(v.shape) == 1:
v = v.unsqueeze(0)
example[k] = move_to_device(v, cur_layer_device)
try:
self.model(**example)
except ValueError:
pass
handle.remove()
move_to_device(layers[0], CPU if force_layer_back_to_cpu else cur_layer_device)
for module_name in self.outside_layer_modules:
module = get_module_by_name_prefix(self.model, module_name)
if module is not None:
move_to_device(module, ori_outside_layer_module_devices[module_name])
torch.cuda.empty_cache()
inside_layer_modules = self.inside_layer_modules
if not self.quantize_config.true_sequential:
inside_layer_modules = [sum(inside_layer_modules, [])]
quantizers = {}
for i in range(len(layers)):
logger.info(f"Start quantizing layer {i + 1}/{len(layers)}")
layer = layers[i]
force_layer_back_to_cpu = False
if get_device(layer) == CPU:
move_to_device(layer, CUDA_0)
force_layer_back_to_cpu = True
cur_layer_device = get_device(layer)
full = find_layers(layer)
for names in inside_layer_modules:
subset = {n: full[n] for n in names if n in full}
gptq = {}
for name in subset:
gptq[name] = GPTQ(subset[name])
gptq[name].quantizer.configure(
self.quantize_config.bits,
perchannel=True,
sym=self.quantize_config.sym,
mse=False,
)
def add_batch(name):
def tmp(_, inp, out):
# gptq is mutable.
gptq[name].add_batch(inp[0].data, out.data) # noqa: F821
return tmp
handles = []
for name in subset:
handles.append(subset[name].register_forward_hook(add_batch(name)))
for j in range(num_batches):
layer_input = []
for k, layer_inp in enumerate(layer_inputs[j]):
layer_input.append(move_to_device(layer_inp, cur_layer_device))
layer_attention_mask = move_to_device(attention_masks[j], cur_layer_device)
additional_layer_inputs = {"attention_mask": layer_attention_mask}
layer_position_ids = (
None if not position_ids else move_to_device(position_ids[j], cur_layer_device)
)
if layer_position_ids is not None:
additional_layer_inputs["position_ids"] = layer_position_ids
for k, v in layer_input_kwargs[j].items():
additional_layer_inputs[k] = nested_move_to_device(v, cur_layer_device)
layer(*layer_input, **additional_layer_inputs)
for h in handles:
h.remove()
for name in subset:
logger.info(f"Quantizing {name} in layer {i + 1}/{len(layers)}...")
scale, zero, g_idx = gptq[name].fasterquant(
percdamp=self.quantize_config.damp_percent,
group_size=self.quantize_config.group_size,
actorder=self.quantize_config.desc_act,
static_groups=self.quantize_config.static_groups,
)
quantizers[f"{self.layers_block_name}.{i}.{name}"] = (
gptq[name].quantizer.to(CPU if force_layer_back_to_cpu else cur_layer_device),
move_to_device(scale, CPU if force_layer_back_to_cpu else cur_layer_device),
move_to_device(zero, CPU if force_layer_back_to_cpu else cur_layer_device),
move_to_device(g_idx, CPU if force_layer_back_to_cpu else cur_layer_device),
)
gptq[name].free()
for j in range(num_batches):
layer_input = []
for k, layer_inp in enumerate(layer_inputs[j]):
layer_input.append(move_to_device(layer_inp, cur_layer_device))
layer_attention_mask = move_to_device(attention_masks[j], cur_layer_device)
additional_layer_inputs = {"attention_mask": layer_attention_mask}
layer_position_ids = None if not position_ids else move_to_device(position_ids[j], cur_layer_device)
if layer_position_ids is not None:
additional_layer_inputs["position_ids"] = layer_position_ids
for k, v in layer_input_kwargs[j].items():
additional_layer_inputs[k] = nested_move_to_device(v, cur_layer_device)
layer_output = move_to_device(
layer(*layer_input, **additional_layer_inputs)[0],
cur_layer_device if cache_examples_on_gpu else CPU,
)
layer_outputs.append([layer_output])
layers[i] = move_to_device(layer, CPU if force_layer_back_to_cpu else cur_layer_device)
del layer
del gptq
del layer_inputs
layer_inputs, layer_outputs = layer_outputs, [] # TODO: is it really OK to cache only the first positional argument?
torch.cuda.empty_cache()
pack_model(
model=self.model,
quantizers=quantizers,
bits=self.quantize_config.bits,
group_size=self.quantize_config.group_size,
use_triton=use_triton,
use_cuda_fp16=use_cuda_fp16,
desc_act=self.quantize_config.desc_act,
warmup_triton=autotune_warmup_after_quantized,
force_layer_back_to_cpu=force_layer_back_to_cpu,
use_marlin=self.quantize_config.checkpoint_format == CHECKPOINT_FORMAT.MARLIN,
)
if device_map:
self.model = remove_hook_from_module(self.model, recurse=True)
self.model = simple_dispatch_model(self.model, device_map)
self.model.config.use_cache = forward_pass_use_cache
self._quantized = True
torch.cuda.empty_cache()
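# Typical calibration-and-quantization flow (a minimal sketch, assuming the public
# AutoGPTQForCausalLM wrapper exposed by this package; names and paths are placeholders):
#
#     quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)
#     model = AutoGPTQForCausalLM.from_pretrained("facebook/opt-125m", quantize_config)
#     model.quantize(examples)  # `examples` as described above
#     model.save_quantized("opt-125m-4bit", use_safetensors=True)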
@property
def device(self):
if not self.hf_device_map:
return self.model.device
else:
device = [d for d in self.hf_device_map.values() if d not in {"disk"}][0]
return torch.device(device)
def to(self, device: Union[str, torch.device]):
self.model.to(device)
return self
def forward(self, *args, **kwargs):
return self.model(*args, **kwargs)
def generate(self, **kwargs):
"""shortcut for model.generate"""
with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
return self.model.generate(**kwargs)
def prepare_inputs_for_generation(self, *args, **kwargs):
"""shortcut for model.prepare_inputs_for_generation"""
return self.model.prepare_inputs_for_generation(*args, **kwargs)
def push_to_hub(
self,
repo_id: str,
save_dir: Optional[str] = None,
use_safetensors: Optional[bool] = True,
safetensors_metadata: Optional[Dict[str, str]] = None,
commit_message: Optional[str] = "Upload of AutoGPTQ quantized model",
use_auth_token: Optional[Union[bool, str]] = None,
private: Optional[bool] = None,
token: Optional[Union[bool, str]] = None,
create_pr: Optional[bool] = False,
) -> str:
"""
Upload the model to the Hugging Face Hub.
Parameters:
repo_id (`str`):
The name of the repository you want to push your model to. It should contain your organization name when
pushing to a given organization.
save_dir (`str`, *optional*):
The name of the local folder to save the model to.
If the model has already been saved, this parameter can be omitted.
use_safetensors (`bool`, *optional*):
Save the model using `safetensors`.
If the model has already been saved, this parameter can be omitted.
safetensors_metadata: (`dict`, *optional*, defaults to `None`):
Pass optional metadata dictionary to be saved in the `safetensors` model file(s).
Metadata is optional and is purely for informational purposes. It does not affect inference.
If `None`, no metadata will be saved.
commit_message (`str`, *optional*, defaults to `"Upload of AutoGPTQ quantized model"`):
Message to commit while pushing.
use_auth_token (`bool` or `str`, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`
is not specified.
private (`bool`, *optional*):
Whether or not the repository created should be private.
token (`bool` or `str`, *optional*):
The token to use as HTTP bearer authorization for remote files. If unset, will use the token generated
when running `huggingface-cli login` (stored in `~/.huggingface`).
create_pr (`bool`, *optional*, defaults to `False`):
Whether or not to create a PR with the uploaded files or directly commit.
"""
if (
self.quantize_config.model_name_or_path is None or not isdir(self.quantize_config.model_name_or_path)
) and save_dir is None:
raise ValueError(
"Quantized model should be saved first, or you can provide save_dir to make sure model is saved to local disk before uploading."
)
if save_dir is not None:
logger.info(f"Saving model to {save_dir}")
self.save_quantized(save_dir, use_safetensors, safetensors_metadata)
repo_url = create_repo(
repo_id=repo_id,
token=token,
private=private,
exist_ok=True,
repo_type="model",
)
repo_id = repo_url.repo_id
if self.quantize_config.model_name_or_path is not None:
work_dir = self.quantize_config.model_name_or_path
operations = [
CommitOperationAdd(path_or_fileobj=join(work_dir, f), path_in_repo=f) for f in os.listdir(work_dir)
]
logger.info(f"Uploading the following files to {repo_id}: {','.join(os.listdir(work_dir))}")
return create_commit(
repo_id=repo_id,
operations=operations,
commit_message=commit_message,
token=use_auth_token,
create_pr=create_pr,
repo_type="model",
)
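# Usage sketch for push_to_hub (illustrative; the repo id and directory are placeholders):
#
#     model.push_to_hub("my-org/opt-125m-4bit", save_dir="opt-125m-4bit", use_safetensors=True)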
def save_quantized(
self,
save_dir: str,
use_safetensors: bool = True,
safetensors_metadata: Optional[Dict[str, str]] = None,
):
"""save quantized model and configs to local disk"""
os.makedirs(save_dir, exist_ok=True)
if not self.quantized:
raise EnvironmentError("can only save quantized model, please execute .quantize first.")
self.model.to(CPU)
model_base_name = (
self.quantize_config.model_file_base_name
or f"gptq_model-{self.quantize_config.bits}bit-{self.quantize_config.group_size}g"
)
if use_safetensors:
model_save_name = model_base_name + ".safetensors"
state_dict = self.model.state_dict()
state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()}
if safetensors_metadata is None:
safetensors_metadata = {}
elif not isinstance(safetensors_metadata, dict):
raise TypeError("safetensors_metadata must be a dictionary.")
else:
logger.debug(f"Received safetensors_metadata: {safetensors_metadata}")
new_safetensors_metadata = {}
converted_keys = False
for key, value in safetensors_metadata.items():
if not isinstance(key, str) or not isinstance(value, str):
converted_keys = True
try:
new_key = str(key)
new_value = str(value)
except Exception as e:
raise TypeError(
f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}"
)
if new_key in new_safetensors_metadata:
logger.warning(
f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting."
)
new_safetensors_metadata[new_key] = new_value
safetensors_metadata = new_safetensors_metadata
if converted_keys:
logger.debug(
f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}"
)
# Format is required to enable Accelerate to load the metadata
# otherwise it raises an OSError
safetensors_metadata["format"] = "pt"
# Store the quantization configuration as safetensors metadata
from auto_gptq import __version__
safetensors_metadata["auto_gptq_version"] = str(__version__)
safetensors_metadata["gptq_bits"] = str(self.quantize_config.bits)
safetensors_metadata["gptq_group_size"] = str(self.quantize_config.group_size)
safetensors_metadata["gptq_desc_act"] = str(self.quantize_config.desc_act)
safetensors_metadata["gptq_damp_percent"] = str(self.quantize_config.damp_percent)
safetensors_metadata["gptq_" + CHECKPOINT_FORMAT_FIELD] = self.quantize_config.checkpoint_format
safetensors_metadata["gptq_" + QUANT_METHOD_FIELD] = self.quantize_config.quant_method
safe_save(state_dict, join(save_dir, model_save_name), safetensors_metadata)
else:
model_save_name = model_base_name + ".bin"
torch.save(self.model.state_dict(), join(save_dir, model_save_name))
self.model.config.quantization_config = self.quantize_config.to_dict()
self.model.config.save_pretrained(save_dir)
self.quantize_config.save_pretrained(save_dir)
self.quantize_config.model_name_or_path = save_dir
self.quantize_config.model_file_base_name = model_base_name
def save_pretrained(
self,
save_dir: str,
use_safetensors: bool = True,
safetensors_metadata: Optional[Dict[str, str]] = None,
**kwargs,
):
"""alias of save_quantized"""
logger.warning("you are using save_pretrained, which will re-direct to save_quantized.")
self.save_quantized(save_dir, use_safetensors, safetensors_metadata)
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: str,
quantize_config: BaseQuantizeConfig,
max_memory: Optional[dict] = None,
trust_remote_code: bool = False,
torch_dtype: torch.dtype = torch.float16,
**model_init_kwargs,
):
"""load un-quantized pretrained model to cpu"""
if not torch.cuda.is_available():
raise EnvironmentError("Load pretrained model to do quantization requires CUDA available.")
def skip(*args, **kwargs):
pass
torch.nn.init.kaiming_uniform_ = skip
torch.nn.init.uniform_ = skip
torch.nn.init.normal_ = skip
# Parameters related to loading from Hugging Face Hub
cache_dir = model_init_kwargs.pop("cache_dir", None)
force_download = model_init_kwargs.pop("force_download", False)
resume_download = model_init_kwargs.pop("resume_download", False)
proxies = model_init_kwargs.pop("proxies", None)
local_files_only = model_init_kwargs.pop("local_files_only", False)
use_auth_token = model_init_kwargs.pop("use_auth_token", None)
revision = model_init_kwargs.pop("revision", None)
subfolder = model_init_kwargs.pop("subfolder", "")
commit_hash = model_init_kwargs.pop("_commit_hash", None)
cached_file_kwargs = {
"cache_dir": cache_dir,
"force_download": force_download,
"proxies": proxies,
"resume_download": resume_download,
"local_files_only": local_files_only,
"use_auth_token": use_auth_token,
"revision": revision,
"subfolder": subfolder,
"_commit_hash": commit_hash,
}
config = AutoConfig.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=True, **cached_file_kwargs
)
if config.model_type not in SUPPORTED_MODELS:
raise TypeError(f"{config.model_type} isn't supported yet.")
# enforce some values regardless of what the user specified
model_init_kwargs["torch_dtype"] = torch_dtype
model_init_kwargs["trust_remote_code"] = trust_remote_code
if max_memory:
if "disk" in max_memory:
raise NotImplementedError("disk offload not support yet.")
with accelerate.init_empty_weights():
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
model.tie_weights()
max_memory = accelerate.utils.get_balanced_memory(
model,
max_memory=max_memory,
no_split_module_classes=[cls.layer_type],
dtype=model_init_kwargs["torch_dtype"],
low_zero=False,
)
model_init_kwargs["device_map"] = accelerate.infer_auto_device_map(
model,
max_memory=max_memory,
no_split_module_classes=[cls.layer_type],
dtype=model_init_kwargs["torch_dtype"],
)
model_init_kwargs["low_cpu_mem_usage"] = True
del model
else:
model_init_kwargs["device_map"] = None
model_init_kwargs["low_cpu_mem_usage"] = False
torch.cuda.empty_cache()
merged_kwargs = {**model_init_kwargs, **cached_file_kwargs}
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **merged_kwargs)
model_config = model.config.to_dict()
seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"]
if any(k in model_config for k in seq_len_keys):
for key in seq_len_keys:
if key in model_config:
model.seqlen = model_config[key]
break
else:
logger.warning("can't get model's sequence length from model config, will set to 4096.")
model.seqlen = 4096
model.eval()
return cls(model, False, quantize_config)
@classmethod
def from_quantized(
cls,
model_name_or_path: Optional[str],
device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None,
max_memory: Optional[dict] = None,
device: Optional[Union[str, int]] = None,
low_cpu_mem_usage: bool = False,
use_triton: bool = False,
use_qigen: bool = False,
use_marlin: bool = False,
torch_dtype: Optional[torch.dtype] = None,
inject_fused_attention: bool = False,
inject_fused_mlp: bool = False,
use_cuda_fp16: bool = True,
quantize_config: Optional[BaseQuantizeConfig] = None,
model_basename: Optional[str] = None,
use_safetensors: bool = True,
trust_remote_code: bool = False,
warmup_triton: bool = False,
trainable: bool = False,
disable_exllama: Optional[bool] = None,
disable_exllamav2: bool = False,
use_tritonv2: bool = False,
checkpoint_format: Optional[str] = None,
**kwargs,
):
"""load quantized model from local disk"""
# If disable_exllamav2 is True, we want to fall back on the exllama kernel and not the cuda/cuda_old ones.
if disable_exllama is None:
if disable_exllamav2:
disable_exllama = False
else:
disable_exllama = True
# Parameters related to loading from Hugging Face Hub
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", False)
use_auth_token = kwargs.pop("use_auth_token", None)
revision = kwargs.pop("revision", None)
subfolder = kwargs.pop("subfolder", "")
commit_hash = kwargs.pop("_commit_hash", None)
cached_file_kwargs = {
"cache_dir": cache_dir,
"force_download": force_download,
"proxies": proxies,
"resume_download": resume_download,
"local_files_only": local_files_only,
"use_auth_token": use_auth_token,
"revision": revision,
"subfolder": subfolder,
"_raise_exceptions_for_missing_entries": False,
"_commit_hash": commit_hash,
}
if use_qigen and not QIGEN_AVAILABLE:
logger.warning("Qigen is not installed, reset use_qigen to False.")
use_qigen = False
if use_triton and use_tritonv2:
logger.warning(
"Both use_triton and use_tritonv2 are set to True. Defaulting to use_triton."
)
use_tritonv2 = False
if (use_triton or use_tritonv2) and not TRITON_AVAILABLE:
logger.warning("Triton is not installed, reset use_triton to False.")
use_triton = False
use_tritonv2 = False
if not disable_exllama and not EXLLAMA_KERNELS_AVAILABLE:
logger.warning(
"Exllama kernel is not installed, reset disable_exllama to True. "
"This may because you installed auto_gptq using a pre-build wheel "
"on Windows, in which exllama_kernels are not compiled. To use "
"exllama_kernels to further speedup inference, you can re-install "
"auto_gptq from source."
)
disable_exllama = True
if not disable_exllamav2 and not EXLLAMAV2_KERNELS_AVAILABLE:
logger.warning(
"Exllamav2 kernel is not installed, reset disable_exllamav2 to True. "
"This may because you installed auto_gptq using a pre-build wheel "
"on Windows, in which exllama_kernels are not compiled. To use "
"exllama_kernels to further speedup inference, you can re-install "
"auto_gptq from source."
)
disable_exllamav2 = True
if not AUTOGPTQ_CUDA_AVAILABLE:
logger.warning(
"CUDA kernels for auto_gptq are not installed, this will result in "
"very slow inference speed. This may because:\n"
"1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.\n"
"2. You are using pytorch without CUDA support.\n"
"3. CUDA and nvcc are not installed in your device."
)
if use_qigen and QIGEN_AVAILABLE:
logger.warning("QIgen is active. Ignores all settings related to cuda.")
inject_fused_attention = False
inject_fused_mlp = False
use_triton = False
disable_exllama = False
disable_exllamav2 = True
if not disable_exllamav2 and not disable_exllama:
logger.warning(
"You have activated both the exllama and exllamav2 kernels. Setting disable_exllama to True and keeping disable_exllamav2 as False."
)
disable_exllama = True
# == step1: prepare configs and file names == #
config = AutoConfig.from_pretrained(
model_name_or_path,
trust_remote_code=trust_remote_code,
**cached_file_kwargs,
)
if config.model_type not in SUPPORTED_MODELS:
raise TypeError(f"{config.model_type} isn't supported yet.")
if quantize_config is None:
quantize_config = BaseQuantizeConfig.from_pretrained(model_name_or_path, checkpoint_format=checkpoint_format, **cached_file_kwargs, **kwargs)
else:
if not isinstance(quantize_config, BaseQuantizeConfig):
quantize_config = BaseQuantizeConfig.from_quant_config(quantize_config, checkpoint_format)
if quantize_config.checkpoint_format == CHECKPOINT_FORMAT.MARLIN:
# format marlin requires marlin kernel
use_marlin = True
marlin_compatible = _validate_marlin_device_support()
if use_marlin and not MARLIN_AVAILABLE:
raise TypeError("use_marlin is true but Marlin is not available due to cuda/device support.")
if not use_marlin and MARLIN_AVAILABLE:
unsupported_reason = _validate_marlin_compatibility(quantize_config)
if unsupported_reason is None and marlin_compatible:
logger.info(
"You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`."
)
if model_basename is None:
if quantize_config.model_file_base_name:
possible_model_basenames = [quantize_config.model_file_base_name]
else:
possible_model_basenames = [
f"gptq_model-{quantize_config.bits}bit-{quantize_config.group_size}g",
"model",
]
else:
possible_model_basenames = [model_basename]
quantize_config.model_name_or_path = model_name_or_path
extensions = []
if use_safetensors:
extensions.append(".safetensors")
else:
extensions += [".bin", ".pt"]
model_name_or_path = str(model_name_or_path)
# Retrieve (and if necessary download) the quantized checkpoint(s).
is_sharded, resolved_archive_file, true_model_basename = get_checkpoints(model_name_or_path=model_name_or_path, extensions=extensions, possible_model_basenames=possible_model_basenames, **cached_file_kwargs)
quantize_config.model_file_base_name = true_model_basename
model_save_name = resolved_archive_file # In case a model is sharded, this would be `model.safetensors.index.json` which may later break.
if (not disable_exllama or not disable_exllamav2) and trainable:
logger.warning(
"QuantLinear with the exllama backend not does support the trainable mode yet, switching to cuda/cuda_old/triton backend."
)
disable_exllama = True
disable_exllamav2 = True
elif not (use_triton or use_tritonv2) and trainable:
logger.warning(
"QuantLinear with cuda backend not support trainable mode yet, Switch to the pytorch backend."
)
# == step2: convert model to gptq-model (replace Linear with QuantLinear) == #
def skip(*args, **kwargs):
pass
if torch_dtype is None:
if not use_qigen:
torch_dtype = torch.float16
else:
torch_dtype = torch.float32
if torch_dtype != torch.float16:
logger.warning("Overriding use_cuda_fp16 to False since torch_dtype is not torch.float16.")
use_cuda_fp16 = False
if not use_qigen:
torch.nn.init.kaiming_uniform_ = skip
torch.nn.init.uniform_ = skip
torch.nn.init.normal_ = skip
transformers.modeling_utils._init_weights = False
init_contexts = [no_init_weights()]
if low_cpu_mem_usage:
init_contexts.append(accelerate.init_empty_weights(include_buffers=False))
with ContextManagers(init_contexts):
model = AutoModelForCausalLM.from_config(
config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype
)
layers = find_layers(model)
ignore_layers = [cls.lm_head_name] + cls.outside_layer_modules
for name in list(layers.keys()):
if any(name.startswith(ignore_layer) for ignore_layer in ignore_layers) or all(
not name.endswith(ignore_layer)
for sublist in cls.inside_layer_modules
for ignore_layer in sublist
):
logger.info(f"The layer {name} is not quantized.")
del layers[name]
make_quant(
model,
layers,
quantize_config.bits,
quantize_config.group_size,
use_triton=use_triton,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_cuda_fp16=use_cuda_fp16,
desc_act=quantize_config.desc_act,
trainable=trainable,
use_tritonv2=use_tritonv2,
)
model.tie_weights()
# == step3: load checkpoint and dispatch == #
if isinstance(device_map, str) and device_map not in [
"auto",
"balanced",
"balanced_low_0",
"sequential",
]:
raise ValueError(
"If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or "
"'sequential'."
)
if isinstance(device_map, dict):
max_memory = None
else:
if device is None and not device_map and not max_memory:
device_map = "auto"
if device is not None:
device = torch.device(device)
if not max_memory and not device_map:
device_map = {"": device.index if device.type == "cuda" else device.type}
if not isinstance(device_map, dict) and device_map != "sequential":
max_memory = accelerate.utils.get_balanced_memory(
model=model,
max_memory=max_memory,
no_split_module_classes=[cls.layer_type],
low_zero=(device_map == "balanced_low_0"),
)
if not isinstance(device_map, dict):
device_map = accelerate.infer_auto_device_map(
model,
max_memory=max_memory,
no_split_module_classes=[cls.layer_type],
)
if low_cpu_mem_usage:
make_sure_no_tensor_in_meta_device(
model,
use_triton,
quantize_config.desc_act,
quantize_config.group_size,
bits=quantize_config.bits,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_tritonv2=use_tritonv2,
)
# TODO: move this logic in an awq_utils.py file.
if quantize_config.checkpoint_format == CHECKPOINT_FORMAT.AWQ_GEMM:
if is_sharded:
raise ValueError("The loading of sharded checkpoints with AWQ checkpoints is currently not supported. Please raise an issue in AutoGPTQ repository.")
if use_marlin:
raise ValueError(
"Tried to load an AWQ model with use_marlin=True. This is currently not supported. Please open an issue in AutoGPTQ repository."
)
model_cache_name, is_cached = quantize_config.get_cache_file_path()
if is_cached:
model_save_name = model_cache_name
logger.info(f"Loading an AWQ model, detected a cached repacked weight at {model_save_name}.")
else:
logger.info(
"Loading an AWQ model. This requires repacking the weights, and no cached repacked checkpoint was found. Grab a coffee!"
)
if "safetensors" not in model_save_name:
raise NotImplementedError(
f"Conversion from AWQ checkpoints is implemented only for safetensors checkpoints, found {model_save_name}"
)
if quantize_config.bits != 4:
raise NotImplementedError(
f"Conversion from AWQ checkpoints is supported only for 4 bits models. Found {quantize_config.bits} bits."
)
gptq_layers = set()
non_gptq_params = set()
with safe_open(model_save_name, framework="pt") as f:
for state_dict_key in f.keys():
if (
"qweight" not in state_dict_key
and "qzeros" not in state_dict_key
and "scales" not in state_dict_key
):
non_gptq_params.add(state_dict_key)
continue
# e.g. prefix "model.layers.3.self_attn.k_proj"
prefix, _ = state_dict_key.rsplit(".", 1)
gptq_layers.add(prefix)
new_state_dict = {}
for state_dict_key in non_gptq_params:
new_state_dict[state_dict_key] = f.get_tensor(state_dict_key)
gptq_layers = sorted(gptq_layers)
max_layer_name_length = len(max(gptq_layers, key=len))
pbar = tqdm(gptq_layers)
i = 0
for gptq_layer_name in pbar:
i += 1
desc = f"Unpacking {gptq_layer_name} + '...'"
desc = desc + " " * (max_layer_name_length - len(desc))
awq_qweight = f.get_tensor(gptq_layer_name + ".qweight")
awq_qzeros = f.get_tensor(gptq_layer_name + ".qzeros")
awq_scales = f.get_tensor(gptq_layer_name + ".scales")
# TODO: add FAST unpacking.
unpacked_qweight, unpacked_qzeros = unpack_awq(
awq_qweight,
awq_qzeros,
awq_scales,
bits=quantize_config.bits,
group_size=quantize_config.group_size,
)
# TODO: add FAST repacking, this is too slow.
desc = f"Repacking {gptq_layer_name}..."
desc = desc + " " * (max_layer_name_length + 12 - len(desc))
pbar.set_description(desc)
gptq_qweight, gptq_qzeros = pack_from_tensors(
unpacked_qweight,
unpacked_qzeros,
awq_scales,
bits=quantize_config.bits,
group_size=quantize_config.group_size,
)
new_state_dict[gptq_layer_name + ".qweight"] = gptq_qweight
new_state_dict[gptq_layer_name + ".qzeros"] = gptq_qzeros
new_state_dict[gptq_layer_name + ".scales"] = awq_scales
safe_save(new_state_dict, model_cache_name)
model_save_name = model_cache_name
if use_marlin:
if is_sharded:
raise ValueError("The loading of sharded checkpoints with Marlin is currently not supported. Please raise an issue in AutoGPTQ repository.")
if torch.version.hip:
raise ValueError("Can not use Marlin int4*fp16 kernel with AMD ROCm version of PyTorch as the kernel is not compatible. Please do not use `use_marlin=True` when using ROCm devices.")
if not _validate_marlin_device_support():
raise ValueError(f'Can not use Marlin int4*fp16 kernel with a device of compute capability {torch.cuda.get_device_capability()}, the minimum compute capability is 8.0 for Marlin kernel. Please do not use `use_marlin=True`, or please upgrade your GPU ("The more you buy, the more you save." - Taiwanese proverb).')
# Validate the model can run in Marlin.
if torch_dtype != torch.float16:
raise ValueError("Marlin kernel requires torch_dtype=torch.float16.")
unsupported_reason = _validate_marlin_compatibility(quantize_config)
if unsupported_reason is not None:
raise ValueError(
f"The model {model_name_or_path} can not be converted to use the Marlin kernel for the following reason: {unsupported_reason}, which is not supported by Marlin kernel."
)
# Load the quant linear type we need.
# TODO: load Marlin directly with the right QuantLinear class.
quant_linear_class = dynamically_import_QuantLinear(
use_triton=use_triton,
desc_act=quantize_config.desc_act,
group_size=quantize_config.group_size,
bits=quantize_config.bits,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_marlin=False,
use_tritonv2=use_tritonv2,  # Get the "original" QuantLinear class
)
# Prepare the model for Marlin load.
# If the checkpoint is Marlin-serialized --> load it directly.
# If the checkpoint has a cached Marlin version --> load from the cached version.
# Otherwise --> convert to Marlin, cache it, and load from the cache.
model, model_save_name = prepare_model_for_marlin_load(
model=model,
quantize_config=quantize_config,
quant_linear_class=quant_linear_class,
torch_dtype=torch_dtype,
current_model_save_name=model_save_name,
device_map=device_map,
)
# Disable incompatible optimizations.
if inject_fused_attention or inject_fused_mlp:
# TODO: Validate whether that can be used.
logger.info("Disabling fused attention and mlp injection because Marlin kernel is used.")
inject_fused_attention = False
inject_fused_mlp = False
load_checkpoint_in_model(
model,
dtype=torch_dtype, # This is very hacky but works due to https://github.com/huggingface/accelerate/blob/bd72a5f1a80d5146554458823f8aeda0a9db5297/src/accelerate/utils/modeling.py#L292
checkpoint=model_save_name,
device_map=device_map,
offload_state_dict=True,
offload_buffers=True,
)
# TODO: Why are we using this custom function and not dispatch_model?
model = simple_dispatch_model(model, device_map)
else:
# Using QiGen.
if is_sharded:
raise ValueError("The loading of sharded checkpoints with QiGen is currently not supported. Please raise an issue in AutoGPTQ repository.")
if quantize_config.desc_act:
NotImplementedError("desc_act=True is not yet supported with QiGen.")
model = AutoModelForCausalLM.from_config(
config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype
)
layers = find_layers(model)
ignore_layers = [cls.lm_head_name] + cls.outside_layer_modules
for name in list(layers.keys()):
if any(name.startswith(ignore_layer) for ignore_layer in ignore_layers):
logger.info(f"{name} not been quantized, will be ignored when make_quant.")
del layers[name]
if model_save_name.endswith(".safetensors"):
checkpoint = safe_load(model_save_name)
else:
checkpoint = torch.load(model_save_name)
make_quant(
model,
layers,
quantize_config.bits,
quantize_config.group_size,
use_triton=use_triton,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_cuda_fp16=use_cuda_fp16,
desc_act=quantize_config.desc_act,
trainable=trainable,
use_qigen=True,
use_tritonv2=use_tritonv2,
use_marlin=quantize_config.checkpoint_format == CHECKPOINT_FORMAT.MARLIN,
)
preprocess_checkpoint_qigen(
model,
layers,
quantize_config.bits,
quantize_config.group_size,
checkpoint,
)
model.load_state_dict(checkpoint)
# == step4: set seqlen == #
model_config = model.config.to_dict()
seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"]
if any(k in model_config for k in seq_len_keys):
for key in seq_len_keys:
if key in model_config:
model.seqlen = model_config[key]
break
else:
logger.warning("can't get model's sequence length from model config, will set to 4096.")
model.seqlen = 4096
# == step5: (optional) inject optimized module == #
if inject_fused_attention:
if cls.fused_attn_module_type is None:
inject_fused_attention = False
logger.warning(f"{cls.__name__} hasn't fused attention module yet, will skip inject fused attention.")
else:
cls.fused_attn_module_type.inject_to_model(
model,
use_triton=use_triton,
group_size=quantize_config.group_size,
use_cuda_fp16=use_cuda_fp16,
desc_act=quantize_config.desc_act,
trainable=trainable,
bits=quantize_config.bits,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_tritonv2=use_tritonv2,
)
if inject_fused_mlp:
if cls.fused_mlp_module_type is None:
inject_fused_mlp = False
logger.warning(f"{cls.__name__} hasn't fused mlp module yet, will skip inject fused mlp.")
else:
cls.fused_mlp_module_type.inject_to_model(model, use_triton=use_triton)
# Any post-initialization that requires device information, for example buffer initialization on device.
model = autogptq_post_init(model, use_act_order=quantize_config.desc_act)
model.eval()
# == step6: (optional) warmup triton == #
if (use_triton or use_tritonv2) and warmup_triton:
if use_tritonv2:
from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear
else:
from ..nn_modules.qlinear.qlinear_triton import QuantLinear
QuantLinear.warmup(model, seqlen=model.seqlen)
if inject_fused_mlp and cls.fused_mlp_module_type is not None:
cls.fused_mlp_module_type.warmup(model, seqlen=model.seqlen)
# == step7: make model compatible with peft
# cls.make_sure_compatible_with_peft(
# model,
# use_triton,
# quantize_config.desc_act,
# quantize_config.group_size,
# bits=quantize_config.bits,
# disable_exllama=disable_exllama,
# disable_exllamav2=disable_exllamav2,
# use_marlin=use_marlin,
# use_qigen=use_qigen,
# )
return cls(
model,
True,
quantize_config,
is_triton_backend=use_triton or use_tritonv2,
injected_fused_attention=inject_fused_attention,
injected_fused_mlp=inject_fused_mlp and (use_triton or use_tritonv2),
trainable=trainable,
)
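# Inference sketch after loading a quantized checkpoint (illustrative, assuming the
# AutoGPTQForCausalLM wrapper and a matching Hugging Face tokenizer; paths are placeholders):
#
#     model = AutoGPTQForCausalLM.from_quantized("opt-125m-4bit", device="cuda:0")
#     output_ids = model.generate(**tokenizer("auto-gptq is", return_tensors="pt").to(model.device))
#     print(tokenizer.decode(output_ids[0]))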
def warmup_triton(self, enabled: bool = True):
if not enabled:
return
if not TRITON_AVAILABLE:
logger.warning("triton is not available, skip warmup stage directly.")
return
from ..nn_modules.qlinear.qlinear_triton import QuantLinear
QuantLinear.warmup(self.model, seqlen=self.model.seqlen)
if self.fused_mlp_module_type is not None:
self.fused_mlp_module_type.warmup(self.model, seqlen=self.model.seqlen)
def enable_trainable_mode(self, enabled: bool = True):
if not self.is_triton_backend and enabled:
raise NotImplementedError("For now, trainable mode only supports triton backend.")
for n, m in self.model.named_modules():
if hasattr(m, "trainable"):
setattr(m, "trainable", enabled)
def disable_trainable_mode(self):
self.enable_trainable_mode(enabled=False)
@staticmethod
def make_sure_compatible_with_peft(
model: PreTrainedModel,
use_triton: bool,
desc_act: bool,
group_size: int,
bits: int,
disable_exllama: bool = True,
disable_exllamav2: bool = False,
use_marlin: bool = False,
use_qigen: bool = False,
use_tritonv2: bool = False,
):
GeneralQuantLinear.inject_to_model(
model,
dynamically_import_QuantLinear(use_triton, desc_act, group_size, bits=bits, disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_marlin=use_marlin, use_qigen=use_qigen),
)
def __getattr__(self, item):
try:
return super().__getattr__(item)
except Exception:
return getattr(self.model, item)
__all__ = ["BaseGPTQForCausalLM", "BaseQuantizeConfig"]
from torch import device
from ..utils.import_utils import compare_transformers_version
CPU = device("cpu")
CUDA_0 = device("cuda:0")
SUPPORTED_MODELS = [
"bloom",
"gptj",
"gpt2",
"gpt_neox",
"opt",
"moss",
"gpt_bigcode",
"codegen",
"RefinedWebModel",
"RefinedWeb",
"baichuan",
"internlm",
"qwen",
"xverse",
"deci",
"stablelm_epoch",
"mpt",
"cohere",
]
if compare_transformers_version("v4.28.0", op="ge"):
SUPPORTED_MODELS.append("llama")
if compare_transformers_version("v4.30.0", op="ge"):
SUPPORTED_MODELS.append("longllama")
if compare_transformers_version("v4.33.0", op="ge"):
SUPPORTED_MODELS.append("falcon")
if compare_transformers_version("v4.34.0", op="ge"):
SUPPORTED_MODELS.append("mistral")
SUPPORTED_MODELS.append("Yi")
if compare_transformers_version("v4.36.0", op="ge"):
SUPPORTED_MODELS.append("mixtral")
if compare_transformers_version("v4.37.0", op="ge"):
SUPPORTED_MODELS.append("qwen2")
SUPPORTED_MODELS.append("phi")
if compare_transformers_version("v4.38.0", op="ge"):
SUPPORTED_MODELS.append("gemma")
if compare_transformers_version("v4.39.0.dev0", op="ge"):
SUPPORTED_MODELS.append("starcoder2")
if compare_transformers_version("v4.43.0.dev0", op="ge"):
SUPPORTED_MODELS.append("gemma2")
EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048
__all__ = ["CPU", "CUDA_0", "SUPPORTED_MODELS", "EXLLAMA_DEFAULT_MAX_INPUT_LENGTH"]
import json
import logging
import os
from logging import getLogger
from typing import List, Optional, Union
import accelerate
import numpy as np
import torch
import torch.nn as nn
import transformers
import threadpoolctl as tctl
from tqdm import tqdm
from transformers import AutoConfig
from transformers.utils.hub import cached_file
from ..utils.import_utils import dynamically_import_QuantLinear
from ..utils.modeling_utils import recurse_setattr
from ._const import CPU, CUDA_0, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, SUPPORTED_MODELS
logger = getLogger(__name__)
handler = logging.StreamHandler()
formatter = logging.Formatter("%(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
def get_device(obj: Union[torch.Tensor, nn.Module]):
if isinstance(obj, torch.Tensor):
return obj.device
return next(obj.parameters()).device
def move_to_device(obj: Optional[Union[torch.Tensor, nn.Module]], device: torch.device):
if obj is None:
return obj
else:
if get_device(obj) != device:
obj = obj.to(device)
return obj
def find_layers(module, layers=None, name=""):
if not layers:
layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear]
for layer in layers:
if isinstance(module, layer):
return {name: module}
res = {}
for name1, child in module.named_children():
res.update(find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1))
return res
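# Example of the mapping returned by find_layers (illustrative): for a LLaMA-style decoder
# block it would look like {"self_attn.q_proj": Linear(...), "self_attn.k_proj": Linear(...), ...},
# i.e. fully-qualified submodule names mapped to the Linear/Conv2d/Conv1D modules to be quantized.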
def get_module_by_name_prefix(model, module_name: str):
for name, module in model.named_modules():
if name.startswith(module_name):
return module
def get_module_by_name_suffix(model, module_name: str):
for name, module in model.named_modules():
if name.endswith(module_name):
return module
def make_quant(
module,
names,
bits,
group_size,
name="",
use_triton: bool = False,
use_marlin: bool = False,
disable_exllama: Optional[bool] = None,
disable_exllamav2: bool = False,
use_qigen: bool = False,
use_cuda_fp16: bool = True,
desc_act: bool = False,
trainable: bool = False,
use_tritonv2: bool = False,
):
# If disable_exllamav2 is True, we want to fall back on the exllama kernel and not the cuda/cuda_old ones.
if disable_exllama is None:
if disable_exllamav2:
disable_exllama = False
else:
disable_exllama = True
QuantLinear = dynamically_import_QuantLinear(
use_triton=use_triton,
desc_act=desc_act,
group_size=group_size,
bits=bits,
use_marlin=use_marlin,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_qigen=use_qigen,
use_tritonv2=use_tritonv2,
)
if isinstance(module, QuantLinear):
return
for name, submodule in module.named_modules():
if name in names:
ori_layer_device = next(submodule.parameters()).device
if isinstance(submodule, nn.Linear):
in_features = submodule.in_features
out_features = submodule.out_features
elif isinstance(submodule, nn.Conv2d):
in_features = submodule.in_channels
out_features = submodule.out_channels
elif isinstance(submodule, transformers.pytorch_utils.Conv1D):
in_features = submodule.weight.shape[0]
out_features = submodule.weight.shape[1]
bias = submodule.bias is not None
if (
(not (desc_act) or group_size == -1)
and not use_triton
and not use_qigen
and not use_tritonv2
):
new_layer = QuantLinear(
bits,
group_size,
in_features,
out_features,
bias,
use_cuda_fp16=use_cuda_fp16,
trainable=trainable,
weight_dtype=submodule.weight.dtype,
)
else:
new_layer = QuantLinear(
bits,
group_size,
in_features,
out_features,
bias,
trainable=trainable,
weight_dtype=submodule.weight.dtype,
)
new_layer.device = ori_layer_device
recurse_setattr(module, name, new_layer.to(ori_layer_device))
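# Note on make_quant: `names` is the set of fully-qualified layer names to replace (as
# produced by find_layers), and each matching nn.Linear / nn.Conv2d / Conv1D submodule is
# swapped for the QuantLinear implementation selected by dynamically_import_QuantLinear.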
def preprocess_checkpoint_qigen(
module,
names,
bits,
group_size,
checkpoint,
name="",
):
try:
import cQIGen as qinfer
except ImportError:
logger.error("cQIGen not installed.")
raise
QuantLinear = dynamically_import_QuantLinear(
use_triton=False,
desc_act=False,
group_size=group_size,
bits=bits,
disable_exllama=False,
use_qigen=True,
)
if isinstance(module, QuantLinear):
in_features = module.infeatures
out_features = module.outfeatures
zeros = checkpoint[name + ".qzeros"]
scales = checkpoint[name + ".scales"].float()
if zeros.dtype != torch.float32:
new_zeros = torch.zeros_like(scales).float().contiguous()
if bits == 4:
qinfer.unpack_zeros4(zeros, new_zeros, new_zeros.shape[0], new_zeros.shape[1])
elif bits == 2:
qinfer.unpack_zeros2(zeros, new_zeros, new_zeros.shape[0], new_zeros.shape[1])
elif bits == 3:
logger.info("Unpacking zeros for 3 bits")
new_scales = scales.contiguous()
else:
if scales.shape[1] != out_features:
new_scales = scales.transpose(0, 1).contiguous()
else:
new_scales = scales.contiguous()
if zeros.shape[1] != out_features:
new_zeros = zeros.transpose(0, 1).contiguous()
else:
new_zeros = zeros.contiguous()
checkpoint[name + ".zeros"], checkpoint[name + ".scales"] = (
new_zeros,
new_scales,
)
del checkpoint[name + ".qzeros"]
del checkpoint[name + ".g_idx"]
if name + ".bias" in checkpoint:
checkpoint[name + ".bias"] = checkpoint[name + ".bias"].float()
else:
checkpoint[name + ".bias"] = torch.zeros(out_features)
checkpoint_qweight = checkpoint[name + ".qweight"].int().contiguous()
if bits == 4:
qweight = torch.zeros(int(in_features // 8 * out_features)).int().contiguous()
qinfer.pack4(
checkpoint_qweight,
qweight,
in_features // 8,
out_features,
module.mb,
module.tb,
module.cutoff,
) # * (module.tt//tb))
elif bits == 3:
qweight = torch.zeros(int(in_features // 32 * 3 * out_features)).int().contiguous()
qinfer.pack3(
checkpoint_qweight,
qweight,
in_features // 32 * 3,
out_features,
module.mb // 32 * 3,
module.tb,
module.cutoff,
)
elif bits == 2:
qweight = torch.zeros(int(in_features // 16 * out_features)).int().contiguous()
qinfer.pack2(
checkpoint_qweight,
qweight,
in_features // 16,
out_features,
module.mb,
module.tb,
module.cutoff,
) # * (module.tt//tb))
checkpoint[name + ".qweight"] = qweight
return
for name1, child in module.named_children():
preprocess_checkpoint_qigen(
child,
names,
bits,
group_size,
checkpoint,
name + "." + name1 if name != "" else name1,
)
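# Note on pack_model: `quantizers` maps fully-qualified layer names to tuples of
# (quantizer, scale, zero, g_idx) as produced by BaseGPTQForCausalLM.quantize(); each
# fp16 layer is then re-encoded on CPU into the packed integer buffers of QuantLinear.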
def pack_model(
model,
quantizers,
bits,
group_size,
use_triton=False,
use_cuda_fp16=True,
desc_act=False,
warmup_triton: bool = False,
force_layer_back_to_cpu: bool = False,
use_marlin: bool = False,
use_tritonv2: bool = False,
):
# Select the exllama / exllamav2 kernels used for packing.
disable_ex = True
disable_exv2 = False
QuantLinear = dynamically_import_QuantLinear(
use_triton=use_triton,
desc_act=desc_act,
group_size=group_size,
bits=bits,
disable_exllama=disable_ex,
disable_exllamav2=disable_exv2,
use_marlin=use_marlin,
use_tritonv2=use_tritonv2,
)
if force_layer_back_to_cpu:
model.to(CPU)
logger.info("Packing model...")
layers = find_layers(model)
layers = {n: layers[n] for n in quantizers}
make_quant(
model,
quantizers,
bits,
group_size,
use_triton=use_triton,
use_cuda_fp16=use_cuda_fp16,
desc_act=desc_act,
disable_exllama=disable_ex,
disable_exllamav2=disable_exv2,
use_marlin=use_marlin,
)
qlayers = find_layers(model, [QuantLinear])
# TODO remove once pack() thread regression is fixed
# Limit pack() thread usage to avoid slow-down: applies limit to all supported libs
with tctl.threadpool_limits(limits=1):
pbar = tqdm(qlayers.keys(), leave=True)
for name in pbar:
pbar.set_description(f"Packing {name}...", refresh=True)
quantizers[name], scale, zero, g_idx = quantizers[name]
# so far can only pack layer on CPU
layer_device = qlayers[name].device
qlayers[name].to(CPU)
layers[name], scale, zero, g_idx = (
layers[name].to(CPU),
scale.to(CPU),
zero.to(CPU),
g_idx.to(CPU),
)
if QuantLinear.QUANT_TYPE == "marlin":
qlayers[name].pack(layers[name], scale)
else:
qlayers[name].pack(layers[name], scale, zero, g_idx)
qlayers[name].to(layer_device)
logger.info("Model packed.")
if use_triton and warmup_triton:
logger.warning(
"using autotune_warmup will move model to GPU, make sure you have enough VRAM to load the whole model."
)
QuantLinear.warmup(model.to(CUDA_0), seqlen=model.seqlen)
def check_and_get_model_type(model_dir, trust_remote_code=False):
config = AutoConfig.from_pretrained(model_dir, trust_remote_code=trust_remote_code)
if config.model_type not in SUPPORTED_MODELS:
raise TypeError(f"{config.model_type} isn't supported yet.")
model_type = config.model_type
return model_type
def simple_dispatch_model(model, device_map):
from accelerate.hooks import AlignDevicesHook, add_hook_to_module
if "" in device_map:
d = device_map[""]
model = model.to(torch.device(d))
model.hf_device_map = device_map
return model
tied_params = accelerate.utils.modeling.find_tied_parameters(model)
if set(device_map.values()) == {"cpu"} or set(device_map.values()) == {
"cpu",
"disk",
}:
main_device = "cpu"
else:
main_device = [d for d in device_map.values() if d not in ["cpu", "disk"]][0]
cpu_offload_group = [(n, d) for n, d in device_map.items() if d == "cpu"]
prev_hook = None
for idx, (n, d) in enumerate(cpu_offload_group):
m = get_module_by_name_suffix(model, n)
_, prev_hook = accelerate.cpu_offload_with_hook(m, execution_device=main_device, prev_module_hook=prev_hook)
# set first cpu offload module's prev_module_hook to the last cpu offload module's hook
if len(cpu_offload_group) > 1:
get_module_by_name_suffix(model, cpu_offload_group[0][0])._hf_hook.prev_module_hook = prev_hook
for n, d in device_map.items():
m = get_module_by_name_suffix(model, n)
if d != "cpu":
d = torch.device(d)
hook = AlignDevicesHook(d, io_same_device=True, place_submodules=True)
add_hook_to_module(m, hook)
accelerate.utils.modeling.retie_parameters(model, tied_params)
model.hf_device_map = device_map
return model
def autogptq_post_init(model, use_act_order: bool, max_input_length: Optional[int] = None):
"""
The max_input_length argument is specific to the exllama backend, which requires initializing a temp_state buffer.
"""
device_to_buffers_size = {}
model_uses_exllama = False
for name, submodule in model.named_modules():
if hasattr(submodule, "QUANT_TYPE") and submodule.QUANT_TYPE == "exllama":
model_uses_exllama = True
device = submodule.qweight.device
if device not in device_to_buffers_size:
device_to_buffers_size[device] = {
"max_dq_buffer_size": 1,
"max_inner_outer_dim": 1,
}
if not use_act_order:
submodule._use_act_order = False
else:
submodule._use_act_order = True
# Disable this heuristic for detecting act_order, but it could be used instead of the config.
"""
if submodule.g_idx is None:
submodule.act_order = False
elif submodule.g_idx is not None and ((submodule.g_idx == 0).all() or torch.equal(submodule.g_idx.cpu(), torch.tensor([i // submodule.group_size for i in range(submodule.g_idx.shape[0])], dtype=torch.int32))):
submodule.g_idx = None
submodule.act_order = False
else:
submodule.act_order = True
"""
device_to_buffers_size[device]["max_dq_buffer_size"] = max(
device_to_buffers_size[device]["max_dq_buffer_size"],
submodule.qweight.numel() * 8,
)
if use_act_order:
device_to_buffers_size[device]["max_inner_outer_dim"] = max(
device_to_buffers_size[device]["max_inner_outer_dim"],
submodule.infeatures,
submodule.outfeatures,
)
if model_uses_exllama:
# To be honest this is quite ugly, not proud of this.
try:
from exllama_kernels import prepare_buffers, set_tuning_params
except ImportError as e:
raise ImportError(
f"Could not import exllama backend dependencies prepare_buffers, set_tuning_params with the following error: {e}"
)
device_to_buffers = {}
if use_act_order:
if max_input_length is None:
max_input_len = EXLLAMA_DEFAULT_MAX_INPUT_LENGTH
else:
max_input_len = max_input_length
else:
if max_input_length is not None:
logger.info(
"Using exllama backend without act-order, the parameter max_input_length was set although not needed, it will be ignored."
)
max_input_len = 1
for device, buffers_size in device_to_buffers_size.items():
# The temp_state buffer is required to reorder X in the act-order case.
# The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill.
device_to_buffers[device] = {
"temp_state": torch.zeros(
(max_input_len, buffers_size["max_inner_outer_dim"]),
dtype=torch.float16,
device=device,
),
"temp_dq": torch.zeros(
(1, buffers_size["max_dq_buffer_size"]),
dtype=torch.float16,
device=device,
),
"max_dq_buffer_size": buffers_size["max_dq_buffer_size"],
"max_inner_outer_dim": buffers_size["max_inner_outer_dim"],
}
# Buffers need to be persistent to avoid any bug.
model.device_to_buffers = device_to_buffers
for device, buffers in model.device_to_buffers.items():
prepare_buffers(device, buffers["temp_state"], buffers["temp_dq"])
# Using the default from exllama repo here.
matmul_recons_thd = 8
matmul_fused_remap = False
matmul_no_half2 = False
set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2)
# The buffers need to have been initialized first before calling make_q4.
for name, submodule in model.named_modules():
if hasattr(submodule, "QUANT_TYPE") and submodule.QUANT_TYPE == "exllama":
submodule.post_init()
## exllamav2
fixed_bytes = {}
model_uses_exllamav2 = False
for _, submodule in model.named_modules():
if hasattr(submodule, "QUANT_TYPE"):
if submodule.QUANT_TYPE == "exllamav2":
model_uses_exllamav2 = True
device = submodule.qweight.device
scratch_fixed = submodule.scratch_space_fixed()
fixed_bytes[device] = max(scratch_fixed, fixed_bytes.get(device, 0))
elif submodule.QUANT_TYPE == "hpu":
submodule.post_init()
if model_uses_exllamav2:
from ..nn_modules.qlinear.qlinear_exllamav2 import ExLlamaV2DeviceTensors
device_tensors = {}
for device, scratch_bytes in fixed_bytes.items():
device_tensors[device] = ExLlamaV2DeviceTensors(device.index, scratch_bytes)
        # Keep the scratch buffers persistent on the model, otherwise we will run out of memory.
model.device_tensors = device_tensors
for _, submodule in model.named_modules():
if hasattr(submodule, "QUANT_TYPE") and submodule.QUANT_TYPE == "exllamav2":
device = submodule.qweight.device
submodule.post_init(temp_dq=model.device_tensors[device])
torch.cuda.empty_cache()
return model
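# Usage sketch (illustrative only; in practice `from_quantized` is expected to call this during
# model loading, and the variable names below are hypothetical):
#
#   quantized_model = autogptq_post_init(
#       quantized_model,
#       use_act_order=quantize_config.desc_act,
#       max_input_length=4096,  # only used with act-order: sizes the exllama temp_state buffer
#   )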
def make_sure_no_tensor_in_meta_device(
    model, use_triton: bool, desc_act: bool, group_size: int, bits: int,
    disable_exllama: bool, disable_exllamav2: bool, use_marlin: bool = False, use_tritonv2: bool = False,
):
    QuantLinear = dynamically_import_QuantLinear(
        use_triton, desc_act, group_size, bits=bits, disable_exllama=disable_exllama,
        disable_exllamav2=disable_exllamav2, use_marlin=use_marlin, use_tritonv2=use_tritonv2,
    )
for n, m in model.named_modules():
if isinstance(m, QuantLinear) and m.bias is not None and m.bias.device == torch.device("meta"):
m.register_buffer("bias", torch.zeros((m.outfeatures), dtype=torch.float16, device="cpu"))
def awq_reverse_reorder_int_tensor(int_tensor, bits: int):
assert bits == 4
int_tensor = int_tensor.T.contiguous()
compress_ratio = 32 // bits
assert int_tensor.shape[-1] % compress_ratio == 0
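    # AWQ packs eight 4-bit values per 32-bit word in the interleaved order below; the
    # permutation built from `order_map` undoes that interleaving so the values can be
    # re-packed sequentially (GPTQ order).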
order_map = [0, 2, 4, 6, 1, 3, 5, 7]
order_tensor = torch.tensor(order_map, dtype=torch.int32, device=int_tensor.device).reshape(1, -1)
order_tensor = order_tensor.repeat(int_tensor.shape[1] // compress_ratio, 1)
order_tensor = order_tensor + torch.arange(
0,
int_tensor.shape[1],
compress_ratio,
dtype=torch.int32,
device=int_tensor.device,
).reshape(-1, 1)
order_tensor = order_tensor.reshape(-1)
reverse_order_tensor = torch.arange(order_tensor.shape[0]).cuda()[order_tensor]
reverse_order_tensor = reverse_order_tensor[order_tensor]
int_tensor = int_tensor[:, reverse_order_tensor]
return int_tensor
def unpack_awq(
awq_qweight: torch.Tensor,
awq_qzeros: torch.Tensor,
awq_scales: torch.Tensor,
bits: int,
group_size: int,
):
"""
Args:
awq_qweight (`torch.LongTensor`):
Expected shape: (in_features, out_features // (32 // bits))
awq_qzeros (`torch.LongTensor`):
Expected shape: (in_features // group_size, out_features // (32 // bits))
awq_scales (`torch.LongTensor`):
Expected shape: (in_features // group_size, out_features)
Returns:
fp16_weight (`torch.LongTensor`):
With shape (in_features, out_features).
zeros (`torch.LongTensor`):
With shape (in_features // group_size, out_features).
"""
assert bits == 4
qzeros = awq_qzeros.cuda()
qweight = awq_qweight.cuda()
qweight = qweight.T.contiguous()
infeatures = awq_qweight.shape[0]
wf = torch.tensor(list(range(0, 32, bits)), dtype=torch.int32, device=qzeros.device).unsqueeze(0)
zeros = torch.bitwise_right_shift(torch.unsqueeze(qzeros, 2), wf.unsqueeze(0)).to(
torch.int16 if bits == 8 else torch.int8
)
# zeros = zeros + 1
torch.bitwise_and(zeros, (2**bits) - 1, out=zeros)
zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 1), wf.unsqueeze(-1)).to(
torch.int16 if bits == 8 else torch.int8
)
torch.bitwise_and(weight, (2**bits) - 1, out=weight)
weight = weight.reshape(-1, group_size, weight.shape[2])
weight = weight.view(-1, weight.shape[-1])
zeros = zeros.view(-1, zeros.shape[-1])
zeros = zeros.T.contiguous()
zeros = awq_reverse_reorder_int_tensor(zeros, bits)
weight = awq_reverse_reorder_int_tensor(weight, bits)
# Dequantize weights.
scales = awq_scales.cuda()
zeros = zeros.contiguous()
scale_zeros = zeros * scales
g_idx = torch.tensor([i // group_size for i in range(infeatures)], dtype=torch.int32)
scale_mat = scales[g_idx]
scale_zeros_mat = scale_zeros[g_idx].half()
qdq_weight_T = weight * scale_mat - scale_zeros_mat.half()
fp16_weight = qdq_weight_T.T.cuda()
return fp16_weight, zeros
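# Usage sketch (illustrative only; `layer` is a hypothetical AWQ linear module with the packed
# buffers named as in AWQ checkpoints):
#
#   fp16_weight, unpacked_zeros = unpack_awq(
#       awq_qweight=layer.qweight,  # int32, (in_features, out_features // 8) for 4-bit
#       awq_qzeros=layer.qzeros,    # int32, (in_features // group_size, out_features // 8)
#       awq_scales=layer.scales,    # fp16,  (in_features // group_size, out_features)
#       bits=4,
#       group_size=128,
#   )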
def pack_from_tensors(
unpacked_qweight: torch.Tensor,
unpacked_qzeros: torch.Tensor,
awq_scales: torch.Tensor,
bits: int,
group_size: int,
):
"""
Args:
unpacked_qweight (`torch.LongTensor`):
Expected shape: (in_features, out_features)
unpacked_qzeros (`torch.LongTensor`):
Expected shape: (in_features // group_size, out_features)
awq_scales (`torch.LongTensor`):
Expected shape: (in_features // group_size, out_features)
Returns:
qweight (`torch.LongTensor`):
With shape (in_features // (32 // bits), out_features)
qzeros (`torch.LongTensor`):
With shape (in_features // group_size, out_features // (32 // bits))
"""
assert bits == 4
W = unpacked_qweight.clone().cpu()
# TODO: This should be checked somehow.
# if isinstance(linear, nn.Conv2d):
# W = W.flatten(1)
# if isinstance(linear, transformers.pytorch_utils.Conv1D):
# W = W.t()
awq_scales = awq_scales.t().contiguous()
unpacked_qzeros = unpacked_qzeros.contiguous()
unpacked_qzeros = unpacked_qzeros.cpu()
awq_scales = awq_scales.cpu()
scale_zeros = unpacked_qzeros.t() * awq_scales
scales = awq_scales.clone()
infeatures = unpacked_qweight.shape[1]
intweight = []
for idx in range(infeatures):
g_idx = idx // group_size
intweight.append(torch.round((W[:, idx] + scale_zeros[:, g_idx]) / scales[:, g_idx]).to(torch.int)[:, None])
intweight = torch.cat(intweight, dim=1)
intweight = intweight.t().contiguous()
intweight = intweight.numpy().astype(np.uint32)
i = 0
row = 0
qweight = np.zeros((intweight.shape[0] // 32 * bits, intweight.shape[1]), dtype=np.uint32)
while row < qweight.shape[0]:
for j in range(i, i + (32 // bits)):
qweight[row] |= intweight[j] << (bits * (j - i))
i += 32 // bits
row += 1
qweight = qweight.astype(np.int32)
qweight = torch.from_numpy(qweight)
unpacked_qzeros = unpacked_qzeros - 1
torch.bitwise_and(unpacked_qzeros, (2**bits) - 1, out=unpacked_qzeros)
unpacked_qzeros = unpacked_qzeros.numpy().astype(np.uint32)
qzeros = np.zeros(
(unpacked_qzeros.shape[0], unpacked_qzeros.shape[1] // 32 * bits),
dtype=np.uint32,
)
i = 0
col = 0
while col < qzeros.shape[1]:
for j in range(i, i + (32 // bits)):
qzeros[:, col] |= unpacked_qzeros[:, j] << (bits * (j - i))
i += 32 // bits
col += 1
qzeros = qzeros.astype(np.int32)
qzeros = torch.from_numpy(qzeros)
return qweight, qzeros
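# Usage sketch (illustrative only): chaining unpack_awq and pack_from_tensors converts an
# AWQ-packed layer into GPTQ-style qweight/qzeros, reusing the same scales.
#
#   fp16_weight, unpacked_zeros = unpack_awq(awq_qweight, awq_qzeros, awq_scales, bits=4, group_size=128)
#   qweight, qzeros = pack_from_tensors(fp16_weight, unpacked_zeros, awq_scales, bits=4, group_size=128)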
def get_checkpoints(
    model_name_or_path: str, extensions: List[str], possible_model_basenames: List[str], **cached_file_kwargs
):
    """
    Retrieves (and, if necessary, downloads from the Hugging Face Hub) the model checkpoint. Sharding is supported.
    All of the `possible_model_basenames` (e.g. `["model", "model-4bit-gptq"]`) are explored over all `extensions`
    (e.g. `[".bin", ".safetensors"]`).
    """
searched_files = []
resolved_archive_file = None
true_model_basename = None
if os.path.isdir(model_name_or_path):
for ext in extensions:
for possible_model_basename in possible_model_basenames:
shard_index_name = possible_model_basename + ext + ".index.json"
searched_files.append(shard_index_name)
possible_index_file = os.path.join(model_name_or_path, shard_index_name)
if os.path.isfile(possible_index_file):
# The model is sharded over several checkpoints.
possible_model_basename = possible_index_file.replace(ext + ".index.json", "")
return True, possible_index_file, possible_model_basename
else:
model_save_name = os.path.join(model_name_or_path, possible_model_basename)
searched_files.append(possible_model_basename + ext)
if os.path.isfile(model_save_name + ext):
resolved_archive_file = model_save_name + ext
return False, resolved_archive_file, possible_model_basename
else:
temp = None
for ext in extensions:
for possible_model_basename in possible_model_basenames:
shard_index_name = possible_model_basename + ext + ".index.json"
shard_index = cached_file(
model_name_or_path,
shard_index_name,
**cached_file_kwargs,
)
searched_files.append(shard_index_name)
if shard_index is not None:
# The model is sharded over several checkpoints.
with open(str(shard_index)) as f:
index_json = json.load(f)
# Download the shards from the index.json.
shards = list(set(index_json["weight_map"].values()))
for shard in shards:
resolved_archive_file = cached_file(
model_name_or_path,
shard,
**cached_file_kwargs,
)
return True, shard_index, possible_model_basename
else:
resolved_archive_file = cached_file(
model_name_or_path,
possible_model_basename + ext,
**cached_file_kwargs,
)
if resolved_archive_file is None:
resolved_archive_file = temp
searched_files.append(possible_model_basename + ext)
if resolved_archive_file is not None:
temp = resolved_archive_file
return False, resolved_archive_file, possible_model_basename
if resolved_archive_file is None:
raise FileNotFoundError(
f"Could not find a model in {model_name_or_path} with a name in {', '.join(searched_files)}. Please specify the argument model_basename to use a custom file name."
)
return False, resolved_archive_file, true_model_basename
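# Usage sketch (illustrative only; the path and basenames are hypothetical):
#
#   is_sharded, resolved_archive_file, true_model_basename = get_checkpoints(
#       model_name_or_path="/path/to/quantized-model",
#       extensions=[".safetensors", ".bin"],
#       possible_model_basenames=["gptq_model-4bit-128g", "model"],
#   )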
__all__ = [
"get_device",
"move_to_device",
"find_layers",
"get_module_by_name_prefix",
"get_module_by_name_suffix",
"make_quant",
"preprocess_checkpoint_qigen",
"pack_model",
"autogptq_post_init",
"check_and_get_model_type",
"simple_dispatch_model",
"make_sure_no_tensor_in_meta_device",
]
from inspect import signature
from typing import Dict, Optional, Union
from ._base import BaseGPTQForCausalLM, BaseQuantizeConfig
from ._utils import check_and_get_model_type
from .baichuan import BaiChuanGPTQForCausalLM
from .bloom import BloomGPTQForCausalLM
from .codegen import CodeGenGPTQForCausalLM
from .cohere import CohereGPTQForCausalLM
from .decilm import DeciLMGPTQForCausalLM
from .gemma import GemmaGPTQForCausalLM
from .gemma2 import Gemma2GPTQForCausalLM
from .gpt2 import GPT2GPTQForCausalLM
from .gpt_bigcode import GPTBigCodeGPTQForCausalLM
from .gpt_neox import GPTNeoXGPTQForCausalLM
from .gptj import GPTJGPTQForCausalLM
from .internlm import InternLMGPTQForCausalLM
from .llama import LlamaGPTQForCausalLM
from .longllama import LongLlamaGPTQForCausalLM
from .mistral import MistralGPTQForCausalLM
from .mixtral import MixtralGPTQForCausalLM
from .moss import MOSSGPTQForCausalLM
from .mpt import MPTGPTQForCausalLM
from .opt import OPTGPTQForCausalLM
from .phi import PhiGPTQForCausalLM
from .qwen import QwenGPTQForCausalLM
from .qwen2 import Qwen2GPTQForCausalLM
from .rw import RWGPTQForCausalLM
from .stablelmepoch import StableLMEpochGPTQForCausalLM
from .starcoder2 import Starcoder2GPTQForCausalLM
from .xverse import XverseGPTQForCausalLM
from .yi import YiGPTQForCausalLM
GPTQ_CAUSAL_LM_MODEL_MAP = {
"bloom": BloomGPTQForCausalLM,
"gpt_neox": GPTNeoXGPTQForCausalLM,
"gptj": GPTJGPTQForCausalLM,
"gpt2": GPT2GPTQForCausalLM,
"llama": LlamaGPTQForCausalLM,
"opt": OPTGPTQForCausalLM,
"moss": MOSSGPTQForCausalLM,
"gpt_bigcode": GPTBigCodeGPTQForCausalLM,
"codegen": CodeGenGPTQForCausalLM,
"cohere": CohereGPTQForCausalLM,
"RefinedWebModel": RWGPTQForCausalLM,
"RefinedWeb": RWGPTQForCausalLM,
"falcon": RWGPTQForCausalLM,
"baichuan": BaiChuanGPTQForCausalLM,
"internlm": InternLMGPTQForCausalLM,
"qwen": QwenGPTQForCausalLM,
"mistral": MistralGPTQForCausalLM,
"Yi": YiGPTQForCausalLM,
"xverse": XverseGPTQForCausalLM,
"deci": DeciLMGPTQForCausalLM,
"stablelm_epoch": StableLMEpochGPTQForCausalLM,
"starcoder2": Starcoder2GPTQForCausalLM,
"mixtral": MixtralGPTQForCausalLM,
"qwen2": Qwen2GPTQForCausalLM,
"longllama": LongLlamaGPTQForCausalLM,
"gemma": GemmaGPTQForCausalLM,
"gemma2": Gemma2GPTQForCausalLM,
"phi": PhiGPTQForCausalLM,
"mpt": MPTGPTQForCausalLM,
}
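# The keys above are Hugging Face `config.model_type` strings: check_and_get_model_type resolves
# a checkpoint to one of them, and from_pretrained / from_quantized dispatch to the matching
# subclass. A sketch of how a new architecture could be registered (hypothetical names; the
# model_type likely also has to be listed in SUPPORTED_MODELS in `_const`):
#
#   class NewArchGPTQForCausalLM(BaseGPTQForCausalLM):
#       layer_type = "NewArchDecoderLayer"
#       layers_block_name = "model.layers"
#       outside_layer_modules = ["model.embed_tokens", "model.norm"]
#       inside_layer_modules = [
#           ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],
#           ["self_attn.o_proj"],
#           ["mlp.up_proj", "mlp.gate_proj"],
#           ["mlp.down_proj"],
#       ]
#
#   GPTQ_CAUSAL_LM_MODEL_MAP["new_arch"] = NewArchGPTQForCausalLM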
class AutoGPTQForCausalLM:
def __init__(self):
raise EnvironmentError(
"AutoGPTQModelForCausalLM is designed to be instantiated\n"
"using `AutoGPTQModelForCausalLM.from_pretrained` if want to quantize a pretrained model.\n"
"using `AutoGPTQModelForCausalLM.from_quantized` if want to inference with quantized model."
)
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: str,
quantize_config: BaseQuantizeConfig,
max_memory: Optional[dict] = None,
trust_remote_code: bool = False,
**model_init_kwargs,
) -> BaseGPTQForCausalLM:
model_type = check_and_get_model_type(pretrained_model_name_or_path, trust_remote_code)
return GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
quantize_config=quantize_config,
max_memory=max_memory,
trust_remote_code=trust_remote_code,
**model_init_kwargs,
)
@classmethod
def from_quantized(
cls,
model_name_or_path: Optional[str],
device_map: Optional[Union[str, Dict[str, Union[str, int]]]] = None,
max_memory: Optional[dict] = None,
device: Optional[Union[str, int]] = None,
low_cpu_mem_usage: bool = False,
use_triton: bool = False,
inject_fused_attention: bool = False,
inject_fused_mlp: bool = False,
use_cuda_fp16: bool = True,
quantize_config: Optional[BaseQuantizeConfig] = None,
model_basename: Optional[str] = None,
use_safetensors: bool = True,
trust_remote_code: bool = False,
warmup_triton: bool = False,
trainable: bool = False,
disable_exllama: Optional[bool] = None,
disable_exllamav2: bool = False,
use_marlin: bool = False,
use_tritonv2: bool = False,
**kwargs,
) -> BaseGPTQForCausalLM:
# If disable_exllamav2 is True, we want to fall back on the exllama kernel and not the cuda/cuda_old ones.
if disable_exllama is None:
if disable_exllamav2:
disable_exllama = False
else:
disable_exllama = True
model_type = check_and_get_model_type(model_name_or_path, trust_remote_code)
quant_func = GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_quantized
# A static list of kwargs needed for huggingface_hub
huggingface_kwargs = [
"cache_dir",
"force_download",
"proxies",
"resume_download",
"local_files_only",
"use_auth_token",
"revision",
"subfolder",
"_raise_exceptions_for_missing_entries",
"_commit_hash",
]
# TODO: do we need this filtering of kwargs? @PanQiWei is there a reason we can't just pass all kwargs?
keywords = {
key: kwargs[key]
for key in list(signature(quant_func).parameters.keys()) + huggingface_kwargs
if key in kwargs
}
return quant_func(
model_name_or_path=model_name_or_path,
device_map=device_map,
max_memory=max_memory,
device=device,
low_cpu_mem_usage=low_cpu_mem_usage,
use_triton=use_triton,
inject_fused_attention=inject_fused_attention,
inject_fused_mlp=inject_fused_mlp,
use_cuda_fp16=use_cuda_fp16,
quantize_config=quantize_config,
model_basename=model_basename,
use_safetensors=use_safetensors,
trust_remote_code=trust_remote_code,
warmup_triton=warmup_triton,
trainable=trainable,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_marlin=use_marlin,
use_tritonv2=use_tritonv2,
**keywords,
)
__all__ = ["AutoGPTQForCausalLM"]
from ._base import BaseGPTQForCausalLM
class BaiChuanGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "DecoderLayer"
layers_block_name = "model.layers"
outside_layer_modules = ["model.embed_tokens", "model.norm"]
inside_layer_modules = [
["self_attn.W_pack"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]
__all__ = ["BaiChuanGPTQForCausalLM"]
from ._base import BaseGPTQForCausalLM
class BloomGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "BloomBlock"
layers_block_name = "transformer.h"
outside_layer_modules = [
"transformer.word_embeddings",
"transformer.word_embeddings_layernorm",
"transformer.ln_f",
]
inside_layer_modules = [
["self_attention.query_key_value"],
["self_attention.dense"],
["mlp.dense_h_to_4h"],
["mlp.dense_4h_to_h"],
]
__all__ = ["BloomGPTQForCausalLM"]
from ._base import BaseGPTQForCausalLM
class CodeGenGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "CodeGenBlock"
layers_block_name = "transformer.h"
outside_layer_modules = ["transformer.wte", "transformer.ln_f"]
inside_layer_modules = [
["attn.qkv_proj"],
["attn.out_proj"],
["mlp.fc_in"],
["mlp.fc_out"],
]
__all__ = ["CodeGenGPTQForCausalLM"]
from logging import getLogger
from ._base import BaseGPTQForCausalLM
logger = getLogger(__name__)
class CohereGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "CohereDecoderLayer"
layers_block_name = "model.layers"
outside_layer_modules = ["model.embed_tokens", "model.norm"]
inside_layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]
__all__ = ["CohereGPTQForCausalLM"]
from logging import getLogger
from ..utils.import_utils import compare_transformers_version
from ._base import BaseGPTQForCausalLM
if compare_transformers_version("v4.28.0", op="ge"):
from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel
from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel
else:
FusedLlamaAttentionForQuantizedModel = None
FusedLlamaMLPForQuantizedModel = None
logger = getLogger(__name__)
class DeciLMGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "DeciLMDecoderLayer"
layers_block_name = "model.layers"
outside_layer_modules = ["model.embed_tokens", "model.norm"]
inside_layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]
fused_attn_module_type = FusedLlamaAttentionForQuantizedModel
fused_mlp_module_type = FusedLlamaMLPForQuantizedModel
__all__ = ["DeciLMGPTQForCausalLM"]
from logging import getLogger
from ._base import BaseGPTQForCausalLM
logger = getLogger(__name__)
class GemmaGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "GemmaDecoderLayer"
layers_block_name = "model.layers"
outside_layer_modules = ["model.embed_tokens", "model.norm"]
inside_layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]
__all__ = ["GemmaGPTQForCausalLM"]
from logging import getLogger
from ._base import BaseGPTQForCausalLM
logger = getLogger(__name__)
class Gemma2GPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "Gemma2DecoderLayer"
layers_block_name = "model.layers"
outside_layer_modules = ["model.embed_tokens", "model.norm"]
inside_layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]
__all__ = ["Gemma2GPTQForCausalLM"]
from ._base import BaseGPTQForCausalLM
class GPT2GPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "GPT2Block"
layers_block_name = "transformer.h"
outside_layer_modules = ["transformer.wte", "transformer.wpe", "transformer.ln_f"]
inside_layer_modules = [
["attn.c_attn"],
["attn.c_proj"],
["mlp.c_fc"],
["mlp.c_proj"],
]
__all__ = ["GPT2GPTQForCausalLM"]
from ._base import BaseGPTQForCausalLM
class GPTBigCodeGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "GPTBigCodeBlock"
layers_block_name = "transformer.h"
outside_layer_modules = ["transformer.wpe", "transformer.wte", "transformer.ln_f"]
inside_layer_modules = [
["attn.c_attn"],
["attn.c_proj"],
["mlp.c_fc"],
["mlp.c_proj"],
]
__all__ = ["GPTBigCodeGPTQForCausalLM"]
from ._base import BaseGPTQForCausalLM
class GPTNeoXGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "GPTNeoXLayer"
layers_block_name = "gpt_neox.layers"
outside_layer_modules = ["gpt_neox.embed_in", "gpt_neox.final_layer_norm"]
inside_layer_modules = [
["attention.query_key_value"],
["attention.dense"],
["mlp.dense_h_to_4h"],
["mlp.dense_4h_to_h"],
]
lm_head_name = "embed_out"
__all__ = ["GPTNeoXGPTQForCausalLM"]
from ..nn_modules.fused_gptj_attn import FusedGPTJAttentionForQuantizedModel
from ._base import BaseGPTQForCausalLM
class GPTJGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "GPTJBlock"
layers_block_name = "transformer.h"
outside_layer_modules = ["transformer.wte", "transformer.ln_f"]
inside_layer_modules = [
["attn.k_proj", "attn.v_proj", "attn.q_proj"],
["attn.out_proj"],
["mlp.fc_in"],
["mlp.fc_out"],
]
fused_attn_module_type = FusedGPTJAttentionForQuantizedModel
__all__ = ["GPTJGPTQForCausalLM"]