Unverified commit 357f281b authored by 小桐桐, committed by GitHub

fix: load_best_model_at_end error when load_in_8bit is True (#23443)

Ref: https://github.com/huggingface/peft/issues/394
    Loading a quantized checkpoint into a non-quantized Linear8bitLt is not supported;
    call module.cuda() before module.load_state_dict().
parent de5f86e5
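
For context, the unsupported path named in the commit message is easy to hit with a bare bitsandbytes layer: a freshly constructed `Linear8bitLt` only quantizes its weights when it is moved to a CUDA device, so loading an already-quantized state dict into a CPU-resident layer fails. A minimal sketch, assuming a recent bitsandbytes with 8-bit serialization support and an available CUDA device; the layer sizes and variable names are illustrative:

```python
import bitsandbytes as bnb

# Moving the layer to the GPU is what quantizes its weights to int8.
src = bnb.nn.Linear8bitLt(64, 64, has_fp16_weights=False)
src.cuda()
quantized_sd = src.state_dict()  # int8 weights plus scaling statistics

dst = bnb.nn.Linear8bitLt(64, 64, has_fp16_weights=False)
# On a CPU-resident, not-yet-quantized layer this raises the error quoted
# above: "Loading a quantized checkpoint into non-quantized Linear8bitLt
# is not supported."
# dst.load_state_dict(quantized_sd)

# The workaround the commit message cites: quantize first, then load.
dst.cuda()
dst.load_state_dict(quantized_sd)
```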
@@ -2226,16 +2226,35 @@ class Trainer:
                 state_dict["_smp_is_partial"] = False
                 load_result = model.load_state_dict(state_dict, strict=True)
             else:
-                # We load the model state dict on the CPU to avoid an OOM error.
-                if self.args.save_safetensors and os.path.isfile(best_safe_model_path):
-                    state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
+                if hasattr(model, "base_model") and getattr(model.base_model, "is_8bit_serializable", False):
+                    # If training base_8_bit_models using PEFT & LoRA, assume that the adapter has been saved properly.
+                    if hasattr(model, "active_adapter") and hasattr(model, "load_adapter"):
+                        if os.path.exists(os.path.join(self.state.best_model_checkpoint, "adapter_model.bin")):
+                            model.load_adapter(self.state.best_model_checkpoint, model.active_adapter)
+                            # load_adapter has no return value at present; modify this when appropriate.
+                            from torch.nn.modules.module import _IncompatibleKeys
+
+                            load_result = _IncompatibleKeys([], [])
+                        else:
+                            logger.warning(
+                                "The intermediate checkpoints of PEFT may not be saved correctly, "
+                                "consider using a `TrainerCallback` to save adapter_model.bin in the corresponding folders; "
+                                "here are some examples: https://github.com/huggingface/peft/issues/96"
+                            )
+                    else:
+                        # We can't do pure 8-bit training using transformers.
+                        logger.warning("Could not load a quantized checkpoint.")
                 else:
-                    state_dict = torch.load(best_model_path, map_location="cpu")
+                    # We load the model state dict on the CPU to avoid an OOM error.
+                    if self.args.save_safetensors and os.path.isfile(best_safe_model_path):
+                        state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
+                    else:
+                        state_dict = torch.load(best_model_path, map_location="cpu")
 
-                # If the model is on the GPU, it still works!
-                # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
-                # which takes *args instead of **kwargs
-                load_result = model.load_state_dict(state_dict, False)
+                    # If the model is on the GPU, it still works!
+                    # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
+                    # which takes *args instead of **kwargs
+                    load_result = model.load_state_dict(state_dict, False)
             if not is_sagemaker_mp_enabled():
                 self._issue_warnings_after_load(load_result)
         elif os.path.exists(os.path.join(self.state.best_model_checkpoint, WEIGHTS_INDEX_NAME)):
...
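
The `logger.warning` branch in the hunk above assumes users arrange for `adapter_model.bin` to land in each checkpoint folder themselves; otherwise there is nothing for `load_adapter` to reload. Below is a sketch of the kind of `TrainerCallback` the warning refers to, in the spirit of the examples collected in https://github.com/huggingface/peft/issues/96; the class name `SavePeftAdapterCallback` is illustrative, not part of any library:

```python
import os

from transformers import TrainerCallback
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR


class SavePeftAdapterCallback(TrainerCallback):
    """On every save, write the PEFT adapter (adapter_model.bin and
    adapter_config.json) into the checkpoint folder the Trainer just made."""

    def on_save(self, args, state, control, **kwargs):
        checkpoint_dir = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")
        # PeftModel.save_pretrained writes only the small adapter files,
        # which is exactly what the patched load path looks for.
        kwargs["model"].save_pretrained(checkpoint_dir)
        return control


# Usage: Trainer(..., callbacks=[SavePeftAdapterCallback()])
```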