Unverified Commit 650a71e1, authored by Konstantin Dobler, committed by GitHub

Support ratios for `logging_steps`, `eval_steps`, and `save_steps` (#23235)



* Ratio option for `logging_steps`, `eval_steps`, `save_steps`

* Add guards if arguments are not set

* Add more detailed comments + formatting

* Update src/transformers/training_args.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/training_args.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/training_args.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Convert args values to `int` if bigger than 1

* `black`

* `make fixup`

---------
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent c34a525d
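For orientation, a minimal usage sketch of the feature this commit adds (the `output_dir` and the ratio values here are arbitrary examples, not from the commit):

from transformers import TrainingArguments

# Values below 1 are treated as ratios of the total number of training
# steps and converted to absolute step counts when training starts.
args = TrainingArguments(
    output_dir="out",
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=0.01,  # log every 1% of total training steps
    eval_steps=0.1,      # evaluate every 10% of total training steps
    save_steps=0.1,      # save a checkpoint every 10% of total training steps
)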
src/transformers/trainer.py
@@ -1712,6 +1712,14 @@ class Trainer:
                     f" {args.max_steps}"
                 )
+            # Compute absolute values for logging, eval, and save if given as ratio
+            if args.logging_steps and args.logging_steps < 1:
+                args.logging_steps = math.ceil(max_steps * args.logging_steps)
+            if args.eval_steps and args.eval_steps < 1:
+                args.eval_steps = math.ceil(max_steps * args.eval_steps)
+            if args.save_steps and args.save_steps < 1:
+                args.save_steps = math.ceil(max_steps * args.save_steps)
             if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug:
                 if self.args.n_gpu > 1:
                     # nn.DataParallel(model) replicates the model, creating new variables and module
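The conversion added above is easy to check in isolation. A small sketch with a made-up `max_steps` and a hypothetical helper (`to_absolute` is not part of the commit) mirroring the `Trainer` logic:

import math

def to_absolute(steps, max_steps):
    # Ratios in (0, 1) become absolute counts, rounded up; absolute
    # values (>= 1) and unset values (None/0) pass through unchanged.
    if steps and steps < 1:
        return math.ceil(max_steps * steps)
    return steps

max_steps = 1000
print(to_absolute(0.1, max_steps))    # 100
print(to_absolute(0.003, max_steps))  # 3 (math.ceil rounds up)
print(to_absolute(500, max_steps))    # 500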
src/transformers/training_args.py
@@ -251,8 +251,9 @@ class TrainingArguments:
         logging_first_step (`bool`, *optional*, defaults to `False`):
             Whether to log and evaluate the first `global_step` or not.
-        logging_steps (`int`, *optional*, defaults to 500):
-            Number of update steps between two logs if `logging_strategy="steps"`.
+        logging_steps (`int` or `float`, *optional*, defaults to 500):
+            Number of update steps between two logs if `logging_strategy="steps"`. Should be an integer or a float in
+            range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps.
         logging_nan_inf_filter (`bool`, *optional*, defaults to `True`):
             Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is `nan`
             or `inf` is filtered and the average loss of the current logging window is taken instead.
@@ -270,8 +271,9 @@ class TrainingArguments:
             - `"no"`: No save is done during training.
             - `"epoch"`: Save is done at the end of each epoch.
             - `"steps"`: Save is done every `save_steps`.
-        save_steps (`int`, *optional*, defaults to 500):
-            Number of updates steps before two checkpoint saves if `save_strategy="steps"`.
+        save_steps (`int` or `float`, *optional*, defaults to 500):
+            Number of updates steps before two checkpoint saves if `save_strategy="steps"`. Should be an integer or a
+            float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps.
         save_total_limit (`int`, *optional*):
             If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
             `output_dir`.
@@ -332,9 +334,10 @@ class TrainingArguments:
         dataloader_drop_last (`bool`, *optional*, defaults to `False`):
             Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
             or not.
-        eval_steps (`int`, *optional*):
+        eval_steps (`int` or `float`, *optional*):
             Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same
-            value as `logging_steps` if not set.
+            value as `logging_steps` if not set. Should be an integer or a float in range `[0,1)`. If smaller than 1,
+            will be interpreted as ratio of total training steps.
         dataloader_num_workers (`int`, *optional*, defaults to 0):
             Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the
             main process.
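Note the interaction with the `logging_steps` fallback documented above: when `eval_steps` is unset, it inherits `logging_steps`, including a ratio value. A quick illustration (values are arbitrary; the fallback is applied when the arguments are initialized):

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    evaluation_strategy="steps",
    logging_steps=0.05,  # eval_steps is unset and falls back to this ratio
)
print(args.eval_steps)  # expected: 0.05, inherited from logging_steps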
@@ -721,13 +724,29 @@ class TrainingArguments:
         metadata={"help": "The logging strategy to use."},
     )
     logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"})
-    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
+    logging_steps: float = field(
+        default=500,
+        metadata={
+            "help": (
+                "Log every X updates steps. Should be an integer or a float in range `[0,1)`."
+                "If smaller than 1, will be interpreted as ratio of total training steps."
+            )
+        },
+    )
     logging_nan_inf_filter: bool = field(default=True, metadata={"help": "Filter nan and inf losses for logging."})
     save_strategy: Union[IntervalStrategy, str] = field(
         default="steps",
         metadata={"help": "The checkpoint save strategy to use."},
     )
-    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
+    save_steps: float = field(
+        default=500,
+        metadata={
+            "help": (
+                "Save checkpoint every X updates steps. Should be an integer or a float in range `[0,1)`."
+                "If smaller than 1, will be interpreted as ratio of total training steps."
+            )
+        },
+    )
     save_total_limit: Optional[int] = field(
         default=None,
         metadata={
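Because the fields are now typed `float`, ratio values also parse from the command line via `HfArgumentParser`. A minimal sketch (the argument values are arbitrary examples):

from transformers import HfArgumentParser, TrainingArguments

parser = HfArgumentParser(TrainingArguments)
(args,) = parser.parse_args_into_dataclasses(
    ["--output_dir", "out", "--logging_steps", "0.01", "--save_steps", "0.1"]
)
print(args.logging_steps, args.save_steps)  # 0.01 0.1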
@@ -854,7 +873,15 @@ class TrainingArguments:
     dataloader_drop_last: bool = field(
         default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."}
     )
-    eval_steps: Optional[int] = field(default=None, metadata={"help": "Run an evaluation every X steps."})
+    eval_steps: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Run an evaluation every X steps. Should be an integer or a float in range `[0,1)`."
+                "If smaller than 1, will be interpreted as ratio of total training steps."
+            )
+        },
+    )
     dataloader_num_workers: int = field(
         default=0,
         metadata={
@@ -1186,6 +1213,19 @@ class TrainingArguments:
         if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0:
             raise ValueError(f"logging strategy {self.logging_strategy} requires non-zero --logging_steps")
+        if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps > 1:
+            if self.logging_steps != int(self.logging_steps):
+                raise ValueError(f"--logging_steps must be an integer if bigger than 1: {self.logging_steps}")
+            self.logging_steps = int(self.logging_steps)
+        if self.evaluation_strategy == IntervalStrategy.STEPS and self.eval_steps > 1:
+            if self.eval_steps != int(self.eval_steps):
+                raise ValueError(f"--eval_steps must be an integer if bigger than 1: {self.eval_steps}")
+            self.eval_steps = int(self.eval_steps)
+        if self.save_strategy == IntervalStrategy.STEPS and self.save_steps > 1:
+            if self.save_steps != int(self.save_steps):
+                raise ValueError(f"--save_steps must be an integer if bigger than 1: {self.save_steps}")
+            self.save_steps = int(self.save_steps)
         # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible.
         if self.load_best_model_at_end:
             if self.evaluation_strategy != self.save_strategy:
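The guards above only coerce, never round: a value like `500.0` is accepted and stored as `int`, while `1.5` is rejected. A hypothetical standalone helper (`coerce_steps` is not in the commit) showing the same rule:

def coerce_steps(value, name):
    # Mirrors the checks above: values greater than 1 must be whole
    # numbers and are stored as int; ratios below 1 stay floats.
    if value > 1:
        if value != int(value):
            raise ValueError(f"--{name} must be an integer if bigger than 1: {value}")
        return int(value)
    return value

print(coerce_steps(500.0, "save_steps"))  # 500
print(coerce_steps(0.25, "save_steps"))   # 0.25
# coerce_steps(1.5, "save_steps") raises ValueError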
@@ -1194,6 +1234,20 @@ class TrainingArguments:
                     f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}"
                 )
             if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0:
+                if self.eval_steps < 1 or self.save_steps < 1:
+                    if not (self.eval_steps < 1 and self.save_steps < 1):
+                        raise ValueError(
+                            "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation "
+                            "steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps"
+                            f"{self.save_steps} and eval_steps {self.eval_steps}."
+                        )
+                    # Work around floating point precision issues
+                    LARGE_MULTIPLIER = 1_000_000
+                    if (self.save_steps * LARGE_MULTIPLIER) % (self.eval_steps * LARGE_MULTIPLIER) != 0:
+                        raise ValueError(
+                            "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation "
+                            f"steps, but found {self.save_steps}, which is not a multiple of {self.eval_steps}."
+                        )
                 raise ValueError(
                     "--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation "
                     f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}."
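The `LARGE_MULTIPLIER` trick exists because plain float modulo misreports exact multiples. A quick demonstration of the failure mode and the workaround (for this pair of values; it is not a general fix for all floats):

# 0.5 is an exact multiple of 0.1, but float modulo disagrees:
print(0.5 % 0.1)  # 0.09999999999999998

# Scaling both operands first sidesteps the representation error here:
LARGE_MULTIPLIER = 1_000_000
print((0.5 * LARGE_MULTIPLIER) % (0.1 * LARGE_MULTIPLIER))  # 0.0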