Unverified Commit b191d7db authored by Zachary Mueller, committed by GitHub

Update all no_trainer with skip_first_batches (#23664)

parent 26a06814
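This commit replaces the manual "iterate and `continue`" batch-skipping loop in each `*_no_trainer.py` example with Accelerate's `Accelerator.skip_first_batches`, and it derives `completed_steps` directly from the checkpoint name so the progress bar is advanced once on resume. The following is a minimal, self-contained sketch of the resulting resume pattern, not the example scripts themselves; the toy dataset, batch size, and `resume_step` value are illustrative assumptions, while `accelerator.skip_first_batches` is the helper the diff actually introduces.

    # Sketch: resuming mid-epoch with skip_first_batches (assumed toy setup).
    import torch
    from torch.utils.data import DataLoader, TensorDataset
    from accelerate import Accelerator

    accelerator = Accelerator()
    dataset = TensorDataset(torch.arange(8).float().unsqueeze(1))  # 8 tiny samples
    train_dataloader = accelerator.prepare(DataLoader(dataset, batch_size=2))

    resume_step = 2  # pretend a checkpoint was saved after 2 batches of this epoch
    starting_epoch, num_train_epochs = 0, 1

    for epoch in range(starting_epoch, num_train_epochs):
        if epoch == starting_epoch and resume_step is not None:
            # Skip the already-seen batches only in the resumed epoch
            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
        else:
            active_dataloader = train_dataloader
        for step, (batch,) in enumerate(active_dataloader):
            print(step, batch.flatten().tolist())  # first yielded batch is [4.0, 5.0]

Only the first resumed epoch uses the truncated dataloader; later epochs iterate the full `train_dataloader` again, which is exactly the branching each hunk below adds.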
@@ -451,22 +451,26 @@ def main():
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             resume_step = int(training_difference.replace("step_", ""))
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
+        # update the progress_bar if load from checkpoint
+        progress_bar.update(completed_steps)
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss
...
@@ -660,29 +660,27 @@ def main():
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             # need to multiply `gradient_accumulation_steps` to reflect real steps
             resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
         # update the progress_bar if load from checkpoint
-        progress_bar.update(starting_epoch * num_update_steps_per_epoch)
-        completed_steps = starting_epoch * num_update_steps_per_epoch
+        progress_bar.update(completed_steps)
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    if step % args.gradient_accumulation_steps == 0:
-                        progress_bar.update(1)
-                        completed_steps += 1
-                    continue
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss
...
@@ -566,29 +566,27 @@ def main():
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             # need to multiply `gradient_accumulation_steps` to reflect real steps
             resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
         # update the progress_bar if load from checkpoint
-        progress_bar.update(starting_epoch * num_update_steps_per_epoch)
-        completed_steps = starting_epoch * num_update_steps_per_epoch
+        progress_bar.update(completed_steps)
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    if step % args.gradient_accumulation_steps == 0:
-                        progress_bar.update(1)
-                        completed_steps += 1
-                    continue
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss
...
@@ -610,29 +610,27 @@ def main():
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             # need to multiply `gradient_accumulation_steps` to reflect real steps
             resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
         # update the progress_bar if load from checkpoint
-        progress_bar.update(starting_epoch * num_update_steps_per_epoch)
-        completed_steps = starting_epoch * num_update_steps_per_epoch
+        progress_bar.update(completed_steps)
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    if step % args.gradient_accumulation_steps == 0:
-                        progress_bar.update(1)
-                        completed_steps += 1
-                    continue
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss
...
@@ -557,22 +557,26 @@ def main():
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             resume_step = int(training_difference.replace("step_", ""))
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
+        # update the progress_bar if load from checkpoint
+        progress_bar.update(completed_steps)
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss
...
@@ -809,22 +809,26 @@ def main():
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             resume_step = int(training_difference.replace("step_", ""))
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
+        # update the progress_bar if load from checkpoint
+        progress_bar.update(completed_steps)
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss
...
@@ -825,22 +825,26 @@ def main():
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             resume_step = int(training_difference.replace("step_", ""))
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
+        # update the progress_bar if load from checkpoint
+        progress_bar.update(completed_steps)
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss
...
@@ -554,22 +554,26 @@ def main():
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             resume_step = int(training_difference.replace("step_", ""))
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
+        # update the progress_bar if load from checkpoint
+        progress_bar.update(completed_steps)
     for epoch in range(starting_epoch, args.num_train_epochs):
+        model.train()
         if args.with_tracking:
             total_loss = 0
-        model.train()
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss
...
@@ -626,22 +626,26 @@ def main():
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             resume_step = int(training_difference.replace("step_", ""))
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
+        # update the progress_bar if load from checkpoint
+        progress_bar.update(completed_steps)
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             with accelerator.accumulate(model):
                 outputs = model(**batch)
                 loss = outputs.loss
...
@@ -510,12 +510,12 @@ def main():
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             outputs = model(**batch)
             loss = outputs.loss
             # We keep track of the loss at each epoch
...
@@ -668,12 +668,12 @@ def main():
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    completed_steps += 1
-                    continue
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             outputs = model(**batch)
             loss = outputs.loss
             # We keep track of the loss at each epoch
...
@@ -607,28 +607,27 @@ def main():
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
             # need to multiply `gradient_accumulation_steps` to reflect real steps
             resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step
         # update the progress_bar if load from checkpoint
-        progress_bar.update(starting_epoch * num_update_steps_per_epoch)
-        completed_steps = starting_epoch * num_update_steps_per_epoch
+        progress_bar.update(completed_steps)
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
             total_loss = 0
-        for step, batch in enumerate(train_dataloader):
-            # We need to skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == starting_epoch:
-                if resume_step is not None and step < resume_step:
-                    if step % args.gradient_accumulation_steps == 0:
-                        progress_bar.update(1)
-                        completed_steps += 1
-                    continue
+        if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
+            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
+            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        else:
+            active_dataloader = train_dataloader
+        for step, batch in enumerate(active_dataloader):
             outputs = model(**batch)
             loss = outputs.loss
             # We keep track of the loss at each epoch
...