"git@developer.sourcefind.cn:modelzoo/ernie-4.5_vllm.git" did not exist on "c5bb5436e25b6ed41810e127edfec483db55d888"
Unverified commit 1681a6d4, authored by Zach Mueller and committed by GitHub

🚨 Fully revert atomic checkpointing 🚨 (#29370)

Fully revert atomic checkpointing
parent 8ef98628
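For context: the reverted mechanism wrote each checkpoint into a temporary staging directory and only renamed it to its final `checkpoint-*` name once every file had been written, so an interrupted save could not leave a half-written checkpoint behind. Below is a minimal, standalone sketch of that stage-then-rename pattern; `save_checkpoint_atomically` and the `write_files` callback are illustrative names, not part of the Trainer API.

```python
import os
import shutil


def save_checkpoint_atomically(run_dir: str, checkpoint_folder: str, write_files) -> str:
    """Sketch: write a checkpoint into a staging directory, then rename it
    into its final location once everything is on disk."""
    output_dir = os.path.join(run_dir, checkpoint_folder)
    staging_dir = os.path.join(run_dir, f"tmp-{checkpoint_folder}")

    # Write all checkpoint files into the staging directory first.
    os.makedirs(staging_dir, exist_ok=True)
    write_files(staging_dir)

    # On POSIX filesystems the rename is atomic, so readers either see the
    # complete checkpoint or no checkpoint at all.
    os.rename(staging_dir, output_dir)

    # Best-effort fsync so the rename itself is durable (non-Windows only).
    if os.name != "nt":
        fd = os.open(output_dir, os.O_RDONLY)
        os.fsync(fd)
        os.close(fd)
    return output_dir


if __name__ == "__main__":
    # Usage example: the writer just drops a dummy weights file.
    os.makedirs("runs", exist_ok=True)
    shutil.rmtree("runs/checkpoint-0", ignore_errors=True)
    save_checkpoint_atomically(
        "runs", "checkpoint-0",
        lambda d: open(os.path.join(d, "weights.bin"), "wb").close(),
    )
    print(sorted(os.listdir("runs/checkpoint-0")))
```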
@@ -2491,21 +2491,13 @@ class Trainer:
         run_dir = self._get_output_dir(trial=trial)
         output_dir = os.path.join(run_dir, checkpoint_folder)
-        if os.path.exists(output_dir) and len(os.listdir(output_dir)) > 0:
-            logger.warning(
-                f"Checkpoint destination directory {output_dir} already exists and is non-empty. "
-                "Saving will proceed but saved results may be invalid."
-            )
-            staging_output_dir = output_dir
-        else:
-            staging_output_dir = os.path.join(run_dir, f"tmp-{checkpoint_folder}")
-        self.save_model(staging_output_dir, _internal_call=True)
+        self.save_model(output_dir, _internal_call=True)
 
         if not self.args.save_only_model:
             # Save optimizer and scheduler
-            self._save_optimizer_and_scheduler(staging_output_dir)
+            self._save_optimizer_and_scheduler(output_dir)
             # Save RNG state
-            self._save_rng_state(staging_output_dir)
+            self._save_rng_state(output_dir)
 
         # Determine the new best metric / best model checkpoint
         if metrics is not None and self.args.metric_for_best_model is not None:
@@ -2525,39 +2517,16 @@ class Trainer:
         # Save the Trainer state
         if self.args.should_save:
-            self.state.save_to_json(os.path.join(staging_output_dir, TRAINER_STATE_NAME))
+            self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
 
         if self.args.push_to_hub:
-            self._push_from_checkpoint(staging_output_dir)
-
-        # Place checkpoint in final location after all saving is finished.
-        # First wait for everyone to finish writing
-        self.args.distributed_state.wait_for_everyone()
-
-        # Then go through the rewriting process, only renaming and rotating from main process(es)
-        if self.is_local_process_zero() if self.args.save_on_each_node else self.is_world_process_zero():
-            if staging_output_dir != output_dir:
-                if os.path.exists(staging_output_dir):
-                    os.rename(staging_output_dir, output_dir)
-
-                    # Ensure rename completed in cases where os.rename is not atomic
-                    # And can only happen on non-windows based systems
-                    if os.name != "nt":
-                        fd = os.open(output_dir, os.O_RDONLY)
-                        os.fsync(fd)
-                        os.close(fd)
-
-            # Maybe delete some older checkpoints.
-            if self.args.should_save:
-                # Solely rely on numerical checkpoint id for rotation.
-                # mtime is not reliable especially on some fuse fs in cloud environments.
-                self._rotate_checkpoints(use_mtime=False, output_dir=run_dir)
-        elif self.is_local_process_zero():
-            # Clean up the remaining staging checkpoint folders on other nodes
-            if staging_output_dir != output_dir and os.path.exists(staging_output_dir):
-                shutil.rmtree(staging_output_dir)
-
-        self.args.distributed_state.wait_for_everyone()
+            self._push_from_checkpoint(output_dir)
+
+        # Maybe delete some older checkpoints.
+        if self.args.should_save:
+            # Solely rely on numerical checkpoint id for rotation.
+            # mtime is not reliable especially on some fuse fs in cloud environments.
+            self._rotate_checkpoints(use_mtime=False, output_dir=run_dir)
 
     def _save_rng_state(self, output_dir):
         # Save RNG state in non-distributed training
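The block removed above also coordinated the final rename across processes: every rank waited at a barrier after writing, only the main process (or the local-main process when `save_on_each_node` is set) renamed the staging folder and rotated old checkpoints, and the other nodes cleaned up their local staging folders. A rough sketch of that barrier-then-rename pattern follows; `finalize_checkpoint`, `_barrier`, and `_is_main_process` are illustrative helpers built on plain `torch.distributed`, not the Trainer's actual implementation.

```python
import os
import shutil

import torch.distributed as dist


def _barrier() -> None:
    # No-op when not running under torch.distributed.
    if dist.is_available() and dist.is_initialized():
        dist.barrier()


def _is_main_process() -> bool:
    return not (dist.is_available() and dist.is_initialized()) or dist.get_rank() == 0


def finalize_checkpoint(staging_dir: str, output_dir: str) -> None:
    """Sketch: move a staged checkpoint into place from one rank only."""
    # 1. Wait until every rank has finished writing into the staging directory.
    _barrier()

    # 2. Only one rank renames the staging folder to its final name.
    if _is_main_process() and os.path.exists(staging_dir):
        os.rename(staging_dir, output_dir)

    # 3. Make sure the rename is visible before any rank proceeds.
    _barrier()

    # 4. Non-main ranks drop any staging folder left on node-local storage
    #    (on a shared filesystem it has already been renamed away).
    if not _is_main_process():
        shutil.rmtree(staging_dir, ignore_errors=True)
```

Launched under `torchrun`, each rank would call `finalize_checkpoint` after writing its shard into `staging_dir`; as a single process the barriers are no-ops and this degrades to a plain rename.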
@@ -84,8 +84,7 @@ from transformers.testing_utils import (
     slow,
     torch_device,
 )
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, HPSearchBackend, get_last_checkpoint
+from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, HPSearchBackend
 from transformers.training_args import OptimizerNames
 from transformers.utils import (
     SAFE_WEIGHTS_INDEX_NAME,
@@ -1406,19 +1405,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
             trainer.train()
         self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False)
 
-    def test_save_checkpoints_is_atomic(self):
-        class UnsaveableTokenizer(PreTrainedTokenizerBase):
-            def save_pretrained(self, *args, **kwargs):
-                raise OSError("simulated file write error")
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5)
-            # Attach unsaveable tokenizer to partially fail checkpointing
-            trainer.tokenizer = UnsaveableTokenizer()
-            with self.assertRaises(OSError) as _context:
-                trainer.train()
-            assert get_last_checkpoint(tmpdir) is None
-
     @require_safetensors
     def test_safe_checkpoints(self):
         for save_safetensors in [True, False]:
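The deleted `test_save_checkpoints_is_atomic` exercised the atomicity guarantee by attaching a tokenizer whose `save_pretrained` always raises, letting checkpointing fail partway, and asserting that no checkpoint was left on disk. The sketch below reproduces the same idea against a generic staged-save helper; `write_checkpoint` and `failing_writer` are illustrative stand-ins, not Transformers APIs.

```python
import os
import tempfile
import unittest


def write_checkpoint(run_dir: str, name: str, writer) -> str:
    """Stage the checkpoint in a tmp-* folder and rename it only on success."""
    staging_dir = os.path.join(run_dir, f"tmp-{name}")
    final_dir = os.path.join(run_dir, name)
    os.makedirs(staging_dir, exist_ok=True)
    writer(staging_dir)  # may raise before anything reaches final_dir
    os.rename(staging_dir, final_dir)
    return final_dir


class AtomicCheckpointTest(unittest.TestCase):
    def test_failed_save_leaves_no_checkpoint(self):
        def failing_writer(path):
            raise OSError("simulated file write error")

        with tempfile.TemporaryDirectory() as tmpdir:
            with self.assertRaises(OSError):
                write_checkpoint(tmpdir, "checkpoint-5", failing_writer)
            # The final checkpoint folder must not exist after the failure.
            self.assertFalse(os.path.isdir(os.path.join(tmpdir, "checkpoint-5")))


if __name__ == "__main__":
    unittest.main()
```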
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from pathlib import Path
from typing import Dict from typing import Dict
import numpy as np import numpy as np
...@@ -237,20 +236,6 @@ if __name__ == "__main__": ...@@ -237,20 +236,6 @@ if __name__ == "__main__":
trainer.args.eval_accumulation_steps = None trainer.args.eval_accumulation_steps = None
# Check that saving does indeed work with temp dir rotation
# If this fails, will see a FileNotFoundError
model = RegressionModel()
training_args.max_steps = 1
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.LambdaLR(opt, lambda x: 1)
trainer = Trainer(
model, training_args, optimizers=(opt, sched), data_collator=DummyDataCollator(), eval_dataset=dataset
)
trainer._save_checkpoint(model=None, trial=None)
# Check that the temp folder does not exist
assert not (Path(training_args.output_dir) / "tmp-checkpoint-0").exists()
assert (Path(training_args.output_dir) / "checkpoint-0").exists()
# Check that `dispatch_batches=False` will work on a finite iterable dataset # Check that `dispatch_batches=False` will work on a finite iterable dataset
train_dataset = FiniteIterableDataset(label_names=["labels", "extra"], length=1) train_dataset = FiniteIterableDataset(label_names=["labels", "extra"], length=1)
......