Unverified commit 75259b44 authored by Zachary Mueller, committed by GitHub

Properly calculate the total train iterations and recalculate num epochs in no_trainer scripts (#17856)
parent 7c1b9128
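
The gist of the change: the scripts previously recomputed num_train_epochs only when --max_train_steps was passed explicitly, and then unconditionally overwrote max_train_steps after accelerator.prepare, clobbering a user-supplied step budget. Now a flag records whether max_train_steps was derived from the epoch count, the step count is re-derived only in that case once the prepared dataloader length is known, and num_train_epochs is always recalculated from the final step count. Below is a minimal standalone sketch of the corrected ordering; the argument values and dataloader lengths are hypothetical, chosen only to illustrate the arithmetic.

import math
from argparse import Namespace

# Hypothetical stand-ins for the parsed CLI arguments and the dataloader length.
args = Namespace(max_train_steps=None, num_train_epochs=3, gradient_accumulation_steps=2)
dataloader_len = 10  # length before accelerator.prepare(...)

# Before prepare: derive max_train_steps from the epoch count if it was not given.
overrode_max_train_steps = False
num_update_steps_per_epoch = math.ceil(dataloader_len / args.gradient_accumulation_steps)
if args.max_train_steps is None:
    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch  # 3 * 5 = 15
    overrode_max_train_steps = True

# After prepare: the dataloader may have been re-sharded, so its length can change.
dataloader_len = 8  # e.g. shorter per process under distributed training
num_update_steps_per_epoch = math.ceil(dataloader_len / args.gradient_accumulation_steps)
if overrode_max_train_steps:
    # Only re-derive the step count when the user did not set it explicitly.
    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch  # 3 * 4 = 12
# Always recalculate the number of epochs from the final step count.
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

print(args.max_train_steps, args.num_train_epochs)  # 12 3

With an explicit --max_train_steps (say 100), overrode_max_train_steps stays False, the user's step budget is preserved after prepare, and num_train_epochs becomes ceil(100 / 4) = 25.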
@@ -371,11 +371,11 @@ def main():
     optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

     # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+        overrode_max_train_steps = True

     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
@@ -391,7 +391,10 @@ def main():

     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

     # Figure out how many steps we should save the Accelerator states
     if hasattr(args.checkpointing_steps, "isdigit"):
......
@@ -474,11 +474,11 @@ def main():
     model.tie_weights()

     # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+        overrode_max_train_steps = True

     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
@@ -494,7 +494,10 @@ def main():

     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

     # Figure out how many steps we should save the Accelerator states
     if hasattr(args.checkpointing_steps, "isdigit"):
......
@@ -518,11 +518,11 @@ def main():
     # shorter in multiprocess)

     # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+        overrode_max_train_steps = True

     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
@@ -538,7 +538,10 @@ def main():

     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

     # Figure out how many steps we should save the Accelerator states
     if hasattr(args.checkpointing_steps, "isdigit"):
......
@@ -470,11 +470,11 @@ def main():
     model.to(device)

     # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+        overrode_max_train_steps = True

     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
@@ -490,7 +490,10 @@ def main():

     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

     # Figure out how many steps we should save the Accelerator states
     if hasattr(args.checkpointing_steps, "isdigit"):
......
@@ -729,11 +729,11 @@ def main():
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

     # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+        overrode_max_train_steps = True

     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
@@ -749,7 +749,10 @@ def main():

     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

     # Figure out how many steps we should save the Accelerator states
     if hasattr(args.checkpointing_steps, "isdigit"):
......
@@ -747,11 +747,11 @@ def main():
     optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

     # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+        overrode_max_train_steps = True

     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
@@ -767,7 +767,10 @@ def main():

     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

     # Figure out how many steps we should save the Accelerator states
     if hasattr(args.checkpointing_steps, "isdigit"):
......
@@ -474,11 +474,11 @@ def main():
     checkpointing_steps = None

     # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+        overrode_max_train_steps = True

     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
@@ -494,7 +494,10 @@ def main():

     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

     # Instantiate metric
     metric = load_metric("mean_iou")
......
@@ -546,8 +546,6 @@ def main():

     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
@@ -556,6 +554,9 @@ def main():
         num_training_steps=args.max_train_steps,
     )

+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
     # 5. Train
     total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
......
@@ -540,11 +540,11 @@ def main():
     optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

     # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+        overrode_max_train_steps = True

     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
@@ -560,7 +560,10 @@ def main():

     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

     # Figure out how many steps we should save the Accelerator states
     if hasattr(args.checkpointing_steps, "isdigit"):
......
@@ -421,11 +421,11 @@ def main():
     optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

     # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+        overrode_max_train_steps = True

     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
@@ -439,9 +439,12 @@ def main():
         model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
     )

-    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

     # Figure out how many steps we should save the Accelerator states
     if hasattr(args.checkpointing_steps, "isdigit"):
......
@@ -536,11 +536,11 @@ def main():
     model.to(device)

     # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+        overrode_max_train_steps = True

     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
@@ -556,7 +556,10 @@ def main():

     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

     # Figure out how many steps we should save the Accelerator states
     if hasattr(args.checkpointing_steps, "isdigit"):
......
@@ -521,11 +521,11 @@ def main():
     optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

     # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if args.max_train_steps is None:
         args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+        overrode_max_train_steps = True

     lr_scheduler = get_scheduler(
         name=args.lr_scheduler_type,
@@ -541,8 +541,10 @@ def main():
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

     # Figure out how many steps we should save the Accelerator states
     if hasattr(args.checkpointing_steps, "isdigit"):
         checkpointing_steps = args.checkpointing_steps
......