Unverified commit ef665088 authored by Hailey Schoelkopf, committed by GitHub

Fix data-parallel evaluation with quantized models (#1270)

* add WIP device_map overrides

* update handling outside of accelerate launcher

* change .to(device) log to debug level

* run linter
parent 03e7df51
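
For context, the core of the fix is to pin each data-parallel replica's entire model to its own GPU with a single-entry `device_map`, instead of letting quantized checkpoints fall back to `device_map="auto"` (which shards weights across every visible GPU and breaks replica-per-GPU evaluation). Below is a minimal standalone sketch of that placement trick, assuming `accelerate` and `transformers` are installed and the script runs under `accelerate launch`; the checkpoint name and quantization flag are illustrative only, not taken from this commit:

```python
from accelerate import Accelerator
from transformers import AutoModelForCausalLM

accelerator = Accelerator()  # one process per GPU under `accelerate launch`

# A device_map whose only key is "" maps the *whole* module tree to a single
# device, so each replica loads a full copy of the model on its own GPU
# instead of sharding across all GPUs as device_map="auto" would.
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",  # illustrative checkpoint, not from this commit
    load_in_8bit=True,  # illustrative bitsandbytes quantization flag
    device_map={"": f"cuda:{accelerator.local_process_index}"},
)
```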
@@ -133,6 +133,8 @@ class HFLM(LM):
         gpus = torch.cuda.device_count()
         accelerator = Accelerator()
+        if accelerator.num_processes > 1:
+            self.accelerator = accelerator
         if not (parallelize or accelerator.num_processes > 1):
             # use user-passed device
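
Note that `self.accelerator` is only set in the multi-process case, so later code can use `hasattr(self, "accelerator")` as the signal that data-parallel evaluation is active; the two hunks below rely on exactly that check.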
@@ -202,15 +204,16 @@
         self.model.tie_weights()

         if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
-            if not (parallelize or autogptq or ("device_map" in kwargs)):
+            # TODO: can remove this whole snippet except in the mps case, perhaps?
+            if not (parallelize or autogptq or hasattr(self, "accelerator")):
                 # place model onto device requested manually,
                 # if not using HF Accelerate or device_map
                 # or any other option that preloads model onto device
                 try:
                     self.model.to(self.device)
                 except ValueError:
-                    eval_logger.info(
-                        "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
+                    eval_logger.debug(
+                        "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
                     )

         self._create_tokenizer(
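
For reference: calling `.to()` on a `bitsandbytes`-quantized model raises a `ValueError` in `transformers`, because quantized weights are placed at load time via `device_map` and cannot be moved afterwards. Since a `device_map` is now always supplied (see the hunk below), hitting this branch is expected rather than exceptional, hence the demotion to debug level.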
@@ -456,12 +459,24 @@
         if parallelize:
             model_kwargs.update(
                 _get_accelerate_args(
-                    device_map_option,
+                    device_map_option,  # TODO: phase out device_map_option?
                     max_memory_per_gpu,
                     max_cpu_memory,
                     offload_folder,
                 )
             )
+        elif "device_map" not in model_kwargs:
+            # set a device_map to initialize model on the right GPU.
+            # this is needed because it seems that the default behavior
+            # for quantized models now seems to be device_map="auto"
+            # which breaks data-parallel mode.
+            if hasattr(self, "accelerator"):
+                model_kwargs.update(
+                    {"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
+                )
+            else:
+                model_kwargs.update({"device_map": {"": str(self.device)}})
+
         if not autogptq:
             if model_kwargs.get("load_in_4bit", None):
                 assert (
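
With these overrides in place, data-parallel evaluation of a quantized model runs one full replica per GPU. A usage sketch, with flags per the project README at the time (task, checkpoint, and batch size are illustrative; exact flag names may differ across versions):

`accelerate launch -m lm_eval --model hf --model_args pretrained=facebook/opt-125m,load_in_4bit=True --tasks lambada_openai --batch_size 8`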