Unverified commit 357f281b authored by 小桐桐, committed by GitHub

fix: load_best_model_at_end error when load_in_8bit is True (#23443)

Ref: https://github.com/huggingface/peft/issues/394
    Loading a quantized checkpoint into a non-quantized Linear8bitLt is not supported;
    call module.cuda() before module.load_state_dict().
parent de5f86e5
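
For context, the unsupported path named in the commit message is easy to hit with a bare bitsandbytes layer: a freshly constructed `Linear8bitLt` only quantizes its weights when it is moved to a CUDA device, so loading an already-quantized state dict into a CPU-resident layer fails. A minimal sketch, assuming a recent bitsandbytes with 8-bit serialization support and an available CUDA device; the layer sizes and variable names are illustrative:

```python
import bitsandbytes as bnb

# Moving the layer to the GPU is what quantizes its weights to int8.
src = bnb.nn.Linear8bitLt(64, 64, has_fp16_weights=False)
src.cuda()
quantized_sd = src.state_dict()  # int8 weights plus scaling statistics

dst = bnb.nn.Linear8bitLt(64, 64, has_fp16_weights=False)
# On a CPU-resident, not-yet-quantized layer this raises the error quoted
# above: "Loading a quantized checkpoint into non-quantized Linear8bitLt
# is not supported."
# dst.load_state_dict(quantized_sd)

# The workaround the commit message cites: quantize first, then load.
dst.cuda()
dst.load_state_dict(quantized_sd)
```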
@@ -2226,16 +2226,35 @@ class Trainer:
                 state_dict["_smp_is_partial"] = False
                 load_result = model.load_state_dict(state_dict, strict=True)
             else:
-                # We load the model state dict on the CPU to avoid an OOM error.
-                if self.args.save_safetensors and os.path.isfile(best_safe_model_path):
-                    state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
+                if hasattr(model, "base_model") and getattr(model.base_model, "is_8bit_serializable", False):
+                    # If training base_8_bit_models using PEFT & LoRA, assume that the adapter has been saved properly.
+                    if hasattr(model, "active_adapter") and hasattr(model, "load_adapter"):
+                        if os.path.exists(os.path.join(self.state.best_model_checkpoint, "adapter_model.bin")):
+                            model.load_adapter(self.state.best_model_checkpoint, model.active_adapter)
+                            # load_adapter has no return value at present; modify this when appropriate.
+                            from torch.nn.modules.module import _IncompatibleKeys
+
+                            load_result = _IncompatibleKeys([], [])
+                        else:
+                            logger.warning(
+                                "The intermediate checkpoints of PEFT may not be saved correctly, "
+                                "consider using a `TrainerCallback` to save adapter_model.bin in the corresponding folders; "
+                                "here are some examples: https://github.com/huggingface/peft/issues/96"
+                            )
+                    else:
+                        # We can't do pure 8-bit training using transformers.
+                        logger.warning("Could not load a quantized checkpoint.")
                 else:
-                    state_dict = torch.load(best_model_path, map_location="cpu")
+                    # We load the model state dict on the CPU to avoid an OOM error.
+                    if self.args.save_safetensors and os.path.isfile(best_safe_model_path):
+                        state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
+                    else:
+                        state_dict = torch.load(best_model_path, map_location="cpu")
 
-                # If the model is on the GPU, it still works!
-                # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
-                # which takes *args instead of **kwargs
-                load_result = model.load_state_dict(state_dict, False)
+                    # If the model is on the GPU, it still works!
+                    # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
+                    # which takes *args instead of **kwargs
+                    load_result = model.load_state_dict(state_dict, False)
             if not is_sagemaker_mp_enabled():
                 self._issue_warnings_after_load(load_result)
         elif os.path.exists(os.path.join(self.state.best_model_checkpoint, WEIGHTS_INDEX_NAME)):
...
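
The `logger.warning` branch in the hunk above assumes users arrange for `adapter_model.bin` to land in each checkpoint folder themselves; otherwise there is nothing for `load_adapter` to reload. Below is a sketch of the kind of `TrainerCallback` the warning refers to, in the spirit of the examples collected in https://github.com/huggingface/peft/issues/96; the class name `SavePeftAdapterCallback` is illustrative, not part of any library:

```python
import os

from transformers import TrainerCallback
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR


class SavePeftAdapterCallback(TrainerCallback):
    """On every save, write the PEFT adapter (adapter_model.bin and
    adapter_config.json) into the checkpoint folder the Trainer just made."""

    def on_save(self, args, state, control, **kwargs):
        checkpoint_dir = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")
        # PeftModel.save_pretrained writes only the small adapter files,
        # which is exactly what the patched load path looks for.
        kwargs["model"].save_pretrained(checkpoint_dir)
        return control


# Usage: Trainer(..., callbacks=[SavePeftAdapterCallback()])
```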