Unverified commit b29eb247, authored by Sylvain Gugger and committed by GitHub
Browse files

Split checkpoint from model_name_or_path in examples (#11492)

* Split checkpoint from model_name_or_path in examples

* Address review comments

* Address review comments
parent d6ec54ba
...@@ -65,7 +65,7 @@ examples/pytorch/token-classification/run_ner.py -h ...@@ -65,7 +65,7 @@ examples/pytorch/token-classification/run_ner.py -h
You can resume training from a previous checkpoint like this: You can resume training from a previous checkpoint like this:
1. Pass `--output_dir previous_output_dir` without `--overwrite_output_dir` to resume training from the latest checkpoint in `output_dir` (what you would use if the training was interrupted, for instance). 1. Pass `--output_dir previous_output_dir` without `--overwrite_output_dir` to resume training from the latest checkpoint in `output_dir` (what you would use if the training was interrupted, for instance).
2. Pass `--model_name_or_path path_to_a_specific_checkpoint` to resume training from that checkpoint folder. 2. Pass `--resume_from_checkpoint path_to_a_specific_checkpoint` to resume training from that checkpoint folder.
Should you want to turn an example into a notebook where you'd no longer have access to the command Should you want to turn an example into a notebook where you'd no longer have access to the command
line, 🤗 Trainer supports resuming from a checkpoint via `trainer.train(resume_from_checkpoint)`. line, 🤗 Trainer supports resuming from a checkpoint via `trainer.train(resume_from_checkpoint)`.
......
...@@ -190,7 +190,7 @@ def main(): ...@@ -190,7 +190,7 @@ def main():
f"Output directory ({training_args.output_dir}) already exists and is not empty. " f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome." "Use --overwrite_output_dir to overcome."
) )
elif last_checkpoint is not None: elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info( logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch." "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
...@@ -413,12 +413,11 @@ def main(): ...@@ -413,12 +413,11 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
if last_checkpoint is not None: checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint checkpoint = last_checkpoint
elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload
......
...@@ -199,7 +199,7 @@ def main(): ...@@ -199,7 +199,7 @@ def main():
f"Output directory ({training_args.output_dir}) already exists and is not empty. " f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome." "Use --overwrite_output_dir to overcome."
) )
elif last_checkpoint is not None: elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info( logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch." "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
...@@ -443,12 +443,11 @@ def main(): ...@@ -443,12 +443,11 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
if last_checkpoint is not None: checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint checkpoint = last_checkpoint
elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload
metrics = train_result.metrics metrics = train_result.metrics
......
...@@ -196,7 +196,7 @@ def main(): ...@@ -196,7 +196,7 @@ def main():
f"Output directory ({training_args.output_dir}) already exists and is not empty. " f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome." "Use --overwrite_output_dir to overcome."
) )
elif last_checkpoint is not None: elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info( logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch." "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
...@@ -419,12 +419,11 @@ def main(): ...@@ -419,12 +419,11 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
if last_checkpoint is not None: checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint checkpoint = last_checkpoint
elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload
metrics = train_result.metrics metrics = train_result.metrics
......
...@@ -223,7 +223,7 @@ def main(): ...@@ -223,7 +223,7 @@ def main():
f"Output directory ({training_args.output_dir}) already exists and is not empty. " f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome." "Use --overwrite_output_dir to overcome."
) )
elif last_checkpoint is not None: elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info( logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch." "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
...@@ -398,12 +398,11 @@ def main(): ...@@ -398,12 +398,11 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
if last_checkpoint is not None: checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint checkpoint = last_checkpoint
elif os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload
metrics = train_result.metrics metrics = train_result.metrics
......
...@@ -216,7 +216,7 @@ def main(): ...@@ -216,7 +216,7 @@ def main():
f"Output directory ({training_args.output_dir}) already exists and is not empty. " f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome." "Use --overwrite_output_dir to overcome."
) )
elif last_checkpoint is not None: elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info( logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch." "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
...@@ -557,12 +557,11 @@ def main(): ...@@ -557,12 +557,11 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
if last_checkpoint is not None: checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint checkpoint = last_checkpoint
elif os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload
......
...@@ -215,7 +215,7 @@ def main(): ...@@ -215,7 +215,7 @@ def main():
f"Output directory ({training_args.output_dir}) already exists and is not empty. " f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome." "Use --overwrite_output_dir to overcome."
) )
elif last_checkpoint is not None: elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info( logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch." "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
...@@ -595,12 +595,11 @@ def main(): ...@@ -595,12 +595,11 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
if last_checkpoint is not None: checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint checkpoint = last_checkpoint
elif os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload
......
...@@ -272,7 +272,7 @@ def main(): ...@@ -272,7 +272,7 @@ def main():
f"Output directory ({training_args.output_dir}) already exists and is not empty. " f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome." "Use --overwrite_output_dir to overcome."
) )
elif last_checkpoint is not None: elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info( logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch." "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
...@@ -520,12 +520,11 @@ def main(): ...@@ -520,12 +520,11 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
if last_checkpoint is not None: checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint checkpoint = last_checkpoint
elif os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload
......
...@@ -196,7 +196,7 @@ def main(): ...@@ -196,7 +196,7 @@ def main():
f"Output directory ({training_args.output_dir}) already exists and is not empty. " f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome." "Use --overwrite_output_dir to overcome."
) )
elif last_checkpoint is not None: elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info( logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch." "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
...@@ -448,14 +448,10 @@ def main(): ...@@ -448,14 +448,10 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
checkpoint = None checkpoint = None
if last_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint checkpoint = last_checkpoint
elif os.path.isdir(model_args.model_name_or_path):
# Check the config from that potential checkpoint has the right number of labels before using it as a
# checkpoint.
if AutoConfig.from_pretrained(model_args.model_name_or_path).num_labels == num_labels:
checkpoint = model_args.model_name_or_path
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
metrics = train_result.metrics metrics = train_result.metrics
max_train_samples = ( max_train_samples = (
......
...@@ -335,13 +335,10 @@ def main(): ...@@ -335,13 +335,10 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
checkpoint = None checkpoint = None
if last_checkpoint is not None: if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint checkpoint = last_checkpoint
elif os.path.isdir(model_args.model_name_or_path):
# Check the config from that potential checkpoint has the right number of labels before using it as a
# checkpoint.
if AutoConfig.from_pretrained(model_args.model_name_or_path).num_labels == num_labels:
checkpoint = model_args.model_name_or_path
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
metrics = train_result.metrics metrics = train_result.metrics
max_train_samples = ( max_train_samples = (
......
...@@ -189,7 +189,7 @@ def main(): ...@@ -189,7 +189,7 @@ def main():
f"Output directory ({training_args.output_dir}) already exists and is not empty. " f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome." "Use --overwrite_output_dir to overcome."
) )
elif last_checkpoint is not None: elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info( logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch." "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
...@@ -437,12 +437,11 @@ def main(): ...@@ -437,12 +437,11 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
if last_checkpoint is not None: checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint checkpoint = last_checkpoint
elif os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
metrics = train_result.metrics metrics = train_result.metrics
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload
......
...@@ -256,7 +256,7 @@ def main(): ...@@ -256,7 +256,7 @@ def main():
f"Output directory ({training_args.output_dir}) already exists and is not empty. " f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome." "Use --overwrite_output_dir to overcome."
) )
elif last_checkpoint is not None: elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info( logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch." "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
...@@ -512,12 +512,11 @@ def main(): ...@@ -512,12 +512,11 @@ def main():
# Training # Training
if training_args.do_train: if training_args.do_train:
if last_checkpoint is not None: checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint checkpoint = last_checkpoint
elif os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload trainer.save_model() # Saves the tokenizer too for easy upload
......
...@@ -301,6 +301,11 @@ class TrainingArguments: ...@@ -301,6 +301,11 @@ class TrainingArguments:
:class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See :class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See
the `example scripts <https://github.com/huggingface/transformers/tree/master/examples>`__ for more the `example scripts <https://github.com/huggingface/transformers/tree/master/examples>`__ for more
details. details.
resume_from_checkpoint (:obj:`str`, `optional`):
The path to a folder with a valid checkpoint for your model. This argument is not directly used by
:class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See
the `example scripts <https://github.com/huggingface/transformers/tree/master/examples>`__ for more
details.
""" """
output_dir: str = field( output_dir: str = field(
...@@ -531,6 +536,10 @@ class TrainingArguments: ...@@ -531,6 +536,10 @@ class TrainingArguments:
push_to_hub: bool = field( push_to_hub: bool = field(
default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."} default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."}
) )
resume_from_checkpoint: Optional[str] = field(
default=None,
metadata={"help": "The path to a folder with a valid checkpoint for your model."},
)
_n_gpu: int = field(init=False, repr=False, default=-1) _n_gpu: int = field(init=False, repr=False, default=-1)
mp_parameters: str = field( mp_parameters: str = field(
default="", default="",
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment