"mmdet3d/models/vscode:/vscode.git/clone" did not exist on "663154526c592c4fd5e07f4eec3aa102cdaa0c8f"
Unverified commit fe803c29, authored by Hailey Schoelkopf, committed by GitHub

Merge pull request #722 from jiqing-feng/4bit_double_quant

add bnb_4bit_use_double_quant and low_cpu_mem_usage
parents 4fbbd60f f5808609
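
In effect, the patch threads a new low_cpu_mem_usage flag through _get_accelerate_args and both loaders, and forwards a bnb_4bit_use_double_quant flag to bitsandbytes 4-bit quantization. As a reading aid, here is a self-contained sketch of _get_accelerate_args as it looks after this change; the signature and the args dict are taken from the diff below, while the offload_folder default and the max_memory construction are collapsed in this excerpt and are assumptions:

import torch
from typing import Optional, Union


def _get_accelerate_args(
    low_cpu_mem_usage: Optional[bool] = True,
    device_map_option: Optional[str] = "auto",
    max_memory_per_gpu: Optional[Union[int, str]] = None,
    max_cpu_memory: Optional[Union[int, str]] = None,
    offload_folder: Optional[str] = "./offload",  # default assumed; not visible in this excerpt
) -> dict:
    """Build the kwargs dict forwarded to `from_pretrained` when `use_accelerate=True`."""
    # The real max_memory construction is collapsed in the diff; this mirrors its intent:
    # one per-GPU cap replicated across visible devices, plus an optional CPU cap.
    max_memory = {}
    if max_memory_per_gpu is not None:
        max_memory = {
            device_idx: max_memory_per_gpu
            for device_idx in range(torch.cuda.device_count())
        }
    if max_cpu_memory is not None:
        max_memory["cpu"] = max_cpu_memory

    args = {}
    if max_memory:
        args["max_memory"] = max_memory
    args["low_cpu_mem_usage"] = low_cpu_mem_usage  # new in this PR
    args["device_map"] = device_map_option
    args["offload_folder"] = offload_folder
    return args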
@@ -19,6 +19,7 @@ _DeviceMapping = NewType("DeviceMapping", Mapping[str, Union[int, str, torch.dev
 def _get_accelerate_args(
+    low_cpu_mem_usage: Optional[bool] = True,
     device_map_option: Optional[str] = "auto",
     max_memory_per_gpu: Optional[Union[int, str]] = None,
     max_cpu_memory: Optional[Union[int, str]] = None,
@@ -38,6 +39,7 @@ def _get_accelerate_args(
     args = {}
     if max_memory:
         args["max_memory"] = max_memory
+    args["low_cpu_mem_usage"] = low_cpu_mem_usage
     args["device_map"] = device_map_option
     args["offload_folder"] = offload_folder
     return args
@@ -80,6 +82,7 @@ class HuggingFaceAutoLM(BaseLM):
         max_length: Optional[int] = None,
         add_special_tokens: Optional[bool] = None,
         use_accelerate: Optional[bool] = False,
+        low_cpu_mem_usage: Optional[bool] = True,
         device_map_option: Optional[str] = "auto",
         max_memory_per_gpu: Optional[Union[int, str]] = None,
         max_cpu_memory: Optional[Union[int, str]] = None,
@@ -93,6 +96,7 @@ class HuggingFaceAutoLM(BaseLM):
         gptq_use_triton: Optional[bool] = False,
         bnb_4bit_quant_type: Optional[str] = None,
         bnb_4bit_compute_dtype: Optional[Union[str, torch.dtype]] = None,
+        bnb_4bit_use_double_quant: Optional[bool] = False,
     ):
         """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.
         Args:
@@ -113,6 +117,8 @@ class HuggingFaceAutoLM(BaseLM):
             use_accelerate (bool, optional, defaults to False):
                 If True, uses the `accelerate` library to load a large model across
                 multiple devices.
+            low_cpu_mem_usage (bool, optional, defaults to True):
+                If True, uses the `accelerate` library to reduce CPU memory usage when loading the model.
             device_map_option (str, optional, defaults to "auto"):
                 The device map option to use when loading the model with
                 `accelerate`.
@@ -160,6 +166,9 @@ class HuggingFaceAutoLM(BaseLM):
             bnb_4bit_compute_dtype (Union[str, torch.dtype], optional, defaults to None):
                 The compute dtype to use for BnB 4bit quantization. See:
                 https://github.com/huggingface/transformers/blob/main/src/transformers/utils/quantization_config.py#L74
+            bnb_4bit_use_double_quant (bool, optional, defaults to False):
+                Whether or not to use double quantization to quantize the absmax values. See:
+                https://github.com/huggingface/transformers/blob/main/src/transformers/utils/quantization_config.py#L80
         """
         super().__init__()
@@ -210,6 +219,7 @@ class HuggingFaceAutoLM(BaseLM):
         model_kwargs = {}
         if use_accelerate:
             model_kwargs = _get_accelerate_args(
+                low_cpu_mem_usage,
                 device_map_option,
                 max_memory_per_gpu,
                 max_cpu_memory,
@@ -227,6 +237,7 @@ class HuggingFaceAutoLM(BaseLM):
             load_in_4bit=load_in_4bit,
             bnb_4bit_quant_type=bnb_4bit_quant_type,
             bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
+            bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
             **model_kwargs,
         )
         # note: peft_path can be different than pretrained model path
@@ -260,6 +271,7 @@ class HuggingFaceAutoLM(BaseLM):
         quantized: Optional[Union[bool, str]] = False,
         revision: str,
         subfolder: str,
+        low_cpu_mem_usage: Optional[bool] = True,
         device_map: Optional[Union[str, _DeviceMapping]] = None,
         max_memory: Optional[dict] = None,
         offload_folder: Optional[str] = None,
@@ -270,6 +282,7 @@ class HuggingFaceAutoLM(BaseLM):
         gptq_use_triton: Optional[bool] = False,
         bnb_4bit_quant_type: Optional[str] = None,
         bnb_4bit_compute_dtype: Optional[Union[str, torch.dtype]] = None,
+        bnb_4bit_use_double_quant: Optional[bool] = False,
     ) -> transformers.AutoModel:
         """Returns a pre-trained pytorch model from a pre-trained model configuration."""
         if not quantized:
@@ -283,9 +296,12 @@ class HuggingFaceAutoLM(BaseLM):
             model_kwargs["bnb_4bit_quant_type"] = bnb_4bit_quant_type
         if bnb_4bit_compute_dtype:
             model_kwargs["bnb_4bit_compute_dtype"] = _get_dtype(bnb_4bit_compute_dtype)
+        if bnb_4bit_use_double_quant:
+            model_kwargs["bnb_4bit_use_double_quant"] = bnb_4bit_use_double_quant
         model = self.AUTO_MODEL_CLASS.from_pretrained(
             pretrained,
             revision=revision + ("/" + subfolder if subfolder is not None else ""),
+            low_cpu_mem_usage=low_cpu_mem_usage,
             device_map=device_map,
             max_memory=max_memory,
             offload_folder=offload_folder,
...
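
For context, both new kwargs ultimately land in transformers' from_pretrained. A minimal sketch of the equivalent direct call (the checkpoint name is a placeholder; BitsAndBytesConfig is the documented home of the same bnb_4bit_* options that this patch forwards as loose kwargs):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Double ("nested") quantization also quantizes the per-block absmax
# constants, saving roughly 0.4 bits per parameter (per the QLoRA paper).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-j-6b",      # placeholder checkpoint
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,     # load onto devices directly instead of first
                                # materializing a full-precision copy on CPU
    device_map="auto",
)

Because double quantization only trades a second, cheap quantization pass for extra memory savings, the patch exposes it as an opt-in flag defaulting to False.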