"include/vscode:/vscode.git/clone" did not exist on "6eb55499234aafec721d71401171c144261a9893"
Unverified commit ef665088, authored by Hailey Schoelkopf, committed by GitHub

Fix data-parallel evaluation with quantized models (#1270)

* add WIP device_map overrides

* update handling outside of accelerate launcher

* change .to(device) log to debug level

* run linter
parent 03e7df51
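In short: when the harness is launched in data-parallel mode, each rank now pins its model replica to its own GPU with an explicit `device_map`, instead of letting quantized checkpoints fall back to `device_map="auto"`, which shards one model across every visible GPU and breaks data-parallel evaluation. A minimal sketch of the pattern outside the harness; the checkpoint name is a placeholder and `bitsandbytes` is assumed to be installed:

    from accelerate import Accelerator
    from transformers import AutoModelForCausalLM

    accelerator = Accelerator()
    # map the root module ("") to this rank's GPU, so every data-parallel
    # worker holds a full copy of the weights on its own device
    device_map = {"": f"cuda:{accelerator.local_process_index}"}
    model = AutoModelForCausalLM.from_pretrained(
        "EleutherAI/pythia-160m",  # placeholder checkpoint
        load_in_4bit=True,  # bitsandbytes 4-bit quantization
        device_map=device_map,
    )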
@@ -133,6 +133,8 @@ class HFLM(LM):
         gpus = torch.cuda.device_count()
         accelerator = Accelerator()
+        if accelerator.num_processes > 1:
+            self.accelerator = accelerator
         if not (parallelize or accelerator.num_processes > 1):
             # use user-passed device
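The stored attribute doubles as a flag: `self.accelerator` only exists when the run was started with more than one process, so later code can detect data-parallel mode with `hasattr`. A self-contained sketch of that pattern (the class and names are illustrative, not the harness's API):

    from accelerate import Accelerator

    class Wrapper:
        def __init__(self):
            accelerator = Accelerator()
            # only keep a handle under a multi-process (data-parallel) launch
            if accelerator.num_processes > 1:
                self.accelerator = accelerator

    w = Wrapper()
    # hasattr() now distinguishes data-parallel runs from single-process ones
    print("data-parallel" if hasattr(w, "accelerator") else "single-process")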
@@ -202,15 +204,16 @@ class HFLM(LM):
             self.model.tie_weights()
 
         if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
-            if not (parallelize or autogptq or ("device_map" in kwargs)):
+            # TODO: can remove this whole snippet except in the mps case, perhaps?
+            if not (parallelize or autogptq or hasattr(self, "accelerator")):
                 # place model onto device requested manually,
                 # if not using HF Accelerate or device_map
                 # or any other option that preloads model onto device
                 try:
                     self.model.to(self.device)
                 except ValueError:
-                    eval_logger.info(
-                        "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
+                    eval_logger.debug(
+                        "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
                     )
 
         self._create_tokenizer(
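For context on the `except ValueError` above: `bitsandbytes`-quantized models cannot be moved after loading, and `transformers` raises a `ValueError` from `.to()`. A small repro (assumes a CUDA GPU and `bitsandbytes`; the checkpoint is again a placeholder):

    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "EleutherAI/pythia-160m",  # placeholder checkpoint
        load_in_4bit=True,
    )
    try:
        model.to("cuda:0")
    except ValueError as err:
        # quantized weights are already dispatched to a device; as the log
        # message above says, this is safe to ignore if the right GPU is in use
        print(err)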
@@ -456,12 +459,24 @@ class HFLM(LM):
         if parallelize:
             model_kwargs.update(
                 _get_accelerate_args(
-                    device_map_option,
+                    device_map_option,  # TODO: phase out device_map_option?
                     max_memory_per_gpu,
                     max_cpu_memory,
                     offload_folder,
                 )
             )
+        elif "device_map" not in model_kwargs:
+            # set a device_map to initialize model on the right GPU.
+            # this is needed because it seems that the default behavior
+            # for quantized models now seems to be device_map="auto"
+            # which breaks data-parallel mode.
+            if hasattr(self, "accelerator"):
+                model_kwargs.update(
+                    {"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
+                )
+            else:
+                model_kwargs.update({"device_map": {"": str(self.device)}})
         if not autogptq:
             if model_kwargs.get("load_in_4bit", None):
                 assert (
...
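With this change, data-parallel evaluation of a quantized model should work under the distributed launcher; a usage sketch, assuming the harness's v0.4-style CLI and a placeholder 4-bit checkpoint:

    accelerate launch -m lm_eval \
        --model hf \
        --model_args pretrained=EleutherAI/pythia-160m,load_in_4bit=True \
        --tasks lambada_openai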