Unverified commit 297602c7, authored by Stas Bekman and committed by GitHub
Browse files

[deepspeed] saving checkpoint fallback when fp16 weights aren't saved (#14948)



* [deepspeed] saving checkpoint fallback when fp16 weights aren't saved

* Bump required deepspeed version to match usage when saving checkpoints

* update version
Co-authored-by: Mihai Balint <balint.mihai@gmail.com>
parent d25e25ee
...@@ -98,7 +98,7 @@ _deps = [ ...@@ -98,7 +98,7 @@ _deps = [
"cookiecutter==1.7.2", "cookiecutter==1.7.2",
"dataclasses", "dataclasses",
"datasets", "datasets",
"deepspeed>=0.5.7", "deepspeed>=0.5.9",
"fairscale>0.3", "fairscale>0.3",
"faiss-cpu", "faiss-cpu",
"fastapi", "fastapi",
......
...@@ -8,7 +8,7 @@ deps = { ...@@ -8,7 +8,7 @@ deps = {
"cookiecutter": "cookiecutter==1.7.2", "cookiecutter": "cookiecutter==1.7.2",
"dataclasses": "dataclasses", "dataclasses": "dataclasses",
"datasets": "datasets", "datasets": "datasets",
"deepspeed": "deepspeed>=0.5.7", "deepspeed": "deepspeed>=0.5.9",
"fairscale": "fairscale>0.3", "fairscale": "fairscale>0.3",
"faiss-cpu": "faiss-cpu", "faiss-cpu": "faiss-cpu",
"fastapi": "fastapi", "fastapi": "fastapi",
......
...@@ -2054,7 +2054,12 @@ class Trainer: ...@@ -2054,7 +2054,12 @@ class Trainer:
# now save the real model if stage3_gather_fp16_weights_on_model_save=True # now save the real model if stage3_gather_fp16_weights_on_model_save=True
# if false it will not be saved. # if false it will not be saved.
# This must be called on all ranks # This must be called on all ranks
self.deepspeed.save_fp16_model(output_dir, WEIGHTS_NAME) if not self.deepspeed.save_fp16_model(output_dir, WEIGHTS_NAME):
logger.warning(
"deepspeed.save_fp16_model didn't save the model, since stage3_gather_fp16_weights_on_model_save=false. "
"Saving the full checkpoint instead, use zero_to_fp32.py to recover weights"
)
self.deepspeed.save_checkpoint(output_dir)
elif self.args.should_save: elif self.args.should_save:
self._save(output_dir) self._save(output_dir)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment