Commit 1b4fc896 authored by haileyschoelkopf

cleanup + change to name=parallelize

parent 23eaa2db
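In practice, this commit renames the HFLM constructor flag use_accelerate to parallelize, so naive multi-GPU model sharding is now requested with parallelize=True. A minimal usage sketch follows; the import path and the pretrained argument name are assumptions inferred from the surrounding code, not confirmed by this diff:

# Hypothetical example; import path and the `pretrained` kwarg are assumptions.
from lm_eval.models.huggingface import HFLM

lm = HFLM(
    pretrained="EleutherAI/pythia-1.4b",  # any HF checkpoint
    parallelize=True,           # was `use_accelerate=True` before this commit
    device_map_option="auto",   # forwarded to HF Accelerate-style device placement
    batch_size=1,
)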
@@ -67,7 +67,8 @@ class HFLM(LM):
         batch_size=1,
         dtype: Optional[Union[str, torch.dtype]] = "auto",
         # arguments used for splitting a model across GPUs naively.
-        use_accelerate: Optional[bool] = False,
+        # only used if `parallelize=True`.
+        parallelize: Optional[bool] = False,
         device_map_option: Optional[str] = "auto",
         max_memory_per_gpu: Optional[Union[int, str]] = None,
         max_cpu_memory: Optional[Union[int, str]] = None,
@@ -81,7 +82,8 @@ class HFLM(LM):
         gpus = torch.cuda.device_count()
-        if gpus <= 1:
+        if gpus <= 1 and not parallelize:
+            # use user-passed device
             if device:
                 if device not in ["cuda", "cpu"]:
                     device = int(device)
@@ -95,23 +97,21 @@ class HFLM(LM):
                     if torch.cuda.is_available()
                     else torch.device("cpu")
                 )
+            self._rank = 0
+            self._world_size = 1
-        elif not use_accelerate:
-            self._device = "cpu"
         else:
+            eval_logger.info(
+                f"Passed device '{device}', but using `accelerate launch` or `parallelize=True`. This will be overridden when placing model."
+            )
+            # TODO: include in warning that `load_in_8bit` etc. affect this too
             self._device = device
         model_kwargs = {}
-        if use_accelerate:
+        if parallelize:
             model_kwargs = _get_accelerate_args(
                 device_map_option,
                 max_memory_per_gpu,
                 max_cpu_memory,
                 offload_folder,
             )
-            print(model_kwargs)
         # TODO: update this to be less of a hack once subfolder is fixed in HF
         revision = revision + ("/" + subfolder if subfolder is not None else "")
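The kwargs assembled by _get_accelerate_args above are splatted into from_pretrained in the next hunk. The helper's body is not shown in this diff; for orientation only, the standard transformers/accelerate sharding arguments it would map onto look roughly like this (an illustration, not the helper's actual return value):

# Illustration only: standard transformers/accelerate model-sharding kwargs.
model_kwargs = {
    "device_map": "auto",                          # from device_map_option
    "max_memory": {0: "40GiB", "cpu": "64GiB"},    # from max_memory_per_gpu / max_cpu_memory
    "offload_folder": "./offload",                 # disk offload location
}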
@@ -138,11 +138,14 @@ class HFLM(LM):
             low_cpu_mem_usage=low_cpu_mem_usage,
             **model_kwargs,
             torch_dtype=utils.get_dtype(dtype),
-        )  # .to(self.device)
+        )
         # forever after, access self._model through self.model property
         self.model.eval()
-        # TODO: call self.model.tie_weights() here
+        self.model.tie_weights()
+        if gpus <= 1 and not parallelize:
+            # place model onto device, if not using HF Accelerate in any form
+            self.model.to(self.device)
         print(self.model.hf_device_map)
         self.tokenizer = transformers.AutoTokenizer.from_pretrained(
             pretrained if tokenizer is None else tokenizer,
             revision=revision,
@@ -153,19 +156,19 @@ class HFLM(LM):
         self._max_length = max_length
         # multithreading and batching
-        self.batch_size_per_gpu = batch_size  # todo: adaptive batch size
-        # if use_accelerate:
-        #     if "lm_head" in self.model.hf_device_map:
-        #         # `accelerate` can place `lm_head` weights on a different device than
-        #         # the user specified one so we force `self._device` to be the same as
-        #         # `lm_head`'s.
-        #         self._device = self.model.hf_device_map["lm_head"]
-        print(self._device, self.model.hf_device_map)
-        # multigpu support with accelerate
-        if gpus > 1 and not use_accelerate:
+        self.batch_size_per_gpu = batch_size
+        # multigpu data-parallel support when launched with accelerate
+        if gpus > 1:
             accelerator = Accelerator()
-            if gpus > accelerator.num_processes:
+            if parallelize:
+                if accelerator.num_processes > 1:
+                    raise RuntimeError(
+                        "Attempted to use both a HF Accelerate `device_map` and to launch via `accelerate launch`. If this is the case, please either remove `parallelize=True` from --model_args or launch outside of the Accelerate launcher."
+                    )
+                else:
+                    pass
+            elif gpus > accelerator.num_processes:
+                # TODO: make sure there's still never an edge case where we unintentionally default to CPU
                 eval_logger.warning(
                     "WARNING: The number of total system GPUs does not match the number of spawned processes. "