Unverified commit ef665088 authored by Hailey Schoelkopf, committed by GitHub

Fix data-parallel evaluation with quantized models (#1270)

* add WIP device_map overrides

* update handling outside of accelerate launcher

* change .to(device) log to debug level

* run linter
parent 03e7df51
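
For context, the core of the fix is to pin each data-parallel replica's entire model to its own GPU with a single-entry `device_map`, instead of letting quantized checkpoints fall back to `device_map="auto"` (which shards weights across every visible GPU and breaks replica-per-GPU evaluation). Below is a minimal standalone sketch of that placement trick, assuming `accelerate` and `transformers` are installed and the script runs under `accelerate launch`; the checkpoint name and quantization flag are illustrative only, not taken from this commit:

```python
from accelerate import Accelerator
from transformers import AutoModelForCausalLM

accelerator = Accelerator()  # one process per GPU under `accelerate launch`

# A device_map whose only key is "" maps the *whole* module tree to a single
# device, so each replica loads a full copy of the model on its own GPU
# instead of sharding across all GPUs as device_map="auto" would.
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",  # illustrative checkpoint, not from this commit
    load_in_8bit=True,  # illustrative bitsandbytes quantization flag
    device_map={"": f"cuda:{accelerator.local_process_index}"},
)
```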
@@ -133,6 +133,8 @@ class HFLM(LM):
         gpus = torch.cuda.device_count()
         accelerator = Accelerator()
+        if accelerator.num_processes > 1:
+            self.accelerator = accelerator
         if not (parallelize or accelerator.num_processes > 1):
             # use user-passed device
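
Note that `self.accelerator` is only set in the multi-process case, so later code can use `hasattr(self, "accelerator")` as the signal that data-parallel evaluation is active; the two hunks below rely on exactly that check.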
@@ -202,15 +204,16 @@
         self.model.tie_weights()

         if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
-            if not (parallelize or autogptq or ("device_map" in kwargs)):
+            # TODO: can remove this whole snippet except in the mps case, perhaps?
+            if not (parallelize or autogptq or hasattr(self, "accelerator")):
                 # place model onto device requested manually,
                 # if not using HF Accelerate or device_map
                 # or any other option that preloads model onto device
                 try:
                     self.model.to(self.device)
                 except ValueError:
-                    eval_logger.info(
-                        "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
+                    eval_logger.debug(
+                        "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
                     )

         self._create_tokenizer(
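
For reference: calling `.to()` on a `bitsandbytes`-quantized model raises a `ValueError` in `transformers`, because quantized weights are placed at load time via `device_map` and cannot be moved afterwards. Since a `device_map` is now always supplied (see the hunk below), hitting this branch is expected rather than exceptional, hence the demotion to debug level.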
@@ -456,12 +459,24 @@
         if parallelize:
             model_kwargs.update(
                 _get_accelerate_args(
-                    device_map_option,
+                    device_map_option,  # TODO: phase out device_map_option?
                     max_memory_per_gpu,
                     max_cpu_memory,
                     offload_folder,
                 )
             )
+        elif "device_map" not in model_kwargs:
+            # set a device_map to initialize model on the right GPU.
+            # this is needed because it seems that the default behavior
+            # for quantized models now seems to be device_map="auto"
+            # which breaks data-parallel mode.
+            if hasattr(self, "accelerator"):
+                model_kwargs.update(
+                    {"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
+                )
+            else:
+                model_kwargs.update({"device_map": {"": str(self.device)}})
+
         if not autogptq:
             if model_kwargs.get("load_in_4bit", None):
                 assert (
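
With these overrides in place, data-parallel evaluation of a quantized model runs one full replica per GPU. A usage sketch, with flags per the project README at the time (task, checkpoint, and batch size are illustrative; exact flag names may differ across versions):

`accelerate launch -m lm_eval --model hf --model_args pretrained=facebook/opt-125m,load_in_4bit=True --tasks lambada_openai --batch_size 8`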