"...git@developer.sourcefind.cn:chenpangpang/diffusers.git" did not exist on "80bc0c0ced1566549dec606f5069e909b86e86b0"
Unverified Commit f7d80cb3 authored by Ethan's avatar Ethan Committed by GitHub
Browse files

Fix steps bugs in no trainer examples (#24197)

Fix step bugs in no trainer + load checkpoint + grad acc
parent 08ae37c8
...@@ -453,10 +453,11 @@ def main(): ...@@ -453,10 +453,11 @@ def main():
resume_step = None resume_step = None
completed_steps = starting_epoch * num_update_steps_per_epoch completed_steps = starting_epoch * num_update_steps_per_epoch
else: else:
resume_step = int(training_difference.replace("step_", "")) # need to multiply `gradient_accumulation_steps` to reflect real steps
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
starting_epoch = resume_step // len(train_dataloader) starting_epoch = resume_step // len(train_dataloader)
resume_step -= starting_epoch * len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader)
completed_steps = resume_step completed_steps = resume_step // args.gradient_accumulation_steps
# update the progress_bar if load from checkpoint # update the progress_bar if load from checkpoint
progress_bar.update(completed_steps) progress_bar.update(completed_steps)
......
...@@ -666,7 +666,7 @@ def main(): ...@@ -666,7 +666,7 @@ def main():
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
starting_epoch = resume_step // len(train_dataloader) starting_epoch = resume_step // len(train_dataloader)
resume_step -= starting_epoch * len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader)
completed_steps = resume_step completed_steps = resume_step // args.gradient_accumulation_steps
# update the progress_bar if load from checkpoint # update the progress_bar if load from checkpoint
progress_bar.update(completed_steps) progress_bar.update(completed_steps)
......
...@@ -572,7 +572,7 @@ def main(): ...@@ -572,7 +572,7 @@ def main():
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
starting_epoch = resume_step // len(train_dataloader) starting_epoch = resume_step // len(train_dataloader)
resume_step -= starting_epoch * len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader)
completed_steps = resume_step completed_steps = resume_step // args.gradient_accumulation_steps
# update the progress_bar if load from checkpoint # update the progress_bar if load from checkpoint
progress_bar.update(completed_steps) progress_bar.update(completed_steps)
......
...@@ -616,7 +616,7 @@ def main(): ...@@ -616,7 +616,7 @@ def main():
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
starting_epoch = resume_step // len(train_dataloader) starting_epoch = resume_step // len(train_dataloader)
resume_step -= starting_epoch * len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader)
completed_steps = resume_step completed_steps = resume_step // args.gradient_accumulation_steps
# update the progress_bar if load from checkpoint # update the progress_bar if load from checkpoint
progress_bar.update(completed_steps) progress_bar.update(completed_steps)
......
...@@ -559,10 +559,11 @@ def main(): ...@@ -559,10 +559,11 @@ def main():
resume_step = None resume_step = None
completed_steps = starting_epoch * num_update_steps_per_epoch completed_steps = starting_epoch * num_update_steps_per_epoch
else: else:
resume_step = int(training_difference.replace("step_", "")) # need to multiply `gradient_accumulation_steps` to reflect real steps
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
starting_epoch = resume_step // len(train_dataloader) starting_epoch = resume_step // len(train_dataloader)
resume_step -= starting_epoch * len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader)
completed_steps = resume_step completed_steps = resume_step // args.gradient_accumulation_steps
# update the progress_bar if load from checkpoint # update the progress_bar if load from checkpoint
progress_bar.update(completed_steps) progress_bar.update(completed_steps)
......
...@@ -811,10 +811,11 @@ def main(): ...@@ -811,10 +811,11 @@ def main():
resume_step = None resume_step = None
completed_steps = starting_epoch * num_update_steps_per_epoch completed_steps = starting_epoch * num_update_steps_per_epoch
else: else:
resume_step = int(training_difference.replace("step_", "")) # need to multiply `gradient_accumulation_steps` to reflect real steps
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
starting_epoch = resume_step // len(train_dataloader) starting_epoch = resume_step // len(train_dataloader)
resume_step -= starting_epoch * len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader)
completed_steps = resume_step completed_steps = resume_step // args.gradient_accumulation_steps
# update the progress_bar if load from checkpoint # update the progress_bar if load from checkpoint
progress_bar.update(completed_steps) progress_bar.update(completed_steps)
......
...@@ -830,7 +830,7 @@ def main(): ...@@ -830,7 +830,7 @@ def main():
resume_step = int(training_difference.replace("step_", "")) resume_step = int(training_difference.replace("step_", ""))
starting_epoch = resume_step // len(train_dataloader) starting_epoch = resume_step // len(train_dataloader)
resume_step -= starting_epoch * len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader)
completed_steps = resume_step completed_steps = resume_step // args.gradient_accumulation_steps
# update the progress_bar if load from checkpoint # update the progress_bar if load from checkpoint
progress_bar.update(completed_steps) progress_bar.update(completed_steps)
......
...@@ -556,10 +556,11 @@ def main(): ...@@ -556,10 +556,11 @@ def main():
resume_step = None resume_step = None
completed_steps = starting_epoch * num_update_steps_per_epoch completed_steps = starting_epoch * num_update_steps_per_epoch
else: else:
resume_step = int(training_difference.replace("step_", "")) # need to multiply `gradient_accumulation_steps` to reflect real steps
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
starting_epoch = resume_step // len(train_dataloader) starting_epoch = resume_step // len(train_dataloader)
resume_step -= starting_epoch * len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader)
completed_steps = resume_step completed_steps = resume_step // args.gradient_accumulation_steps
# update the progress_bar if load from checkpoint # update the progress_bar if load from checkpoint
progress_bar.update(completed_steps) progress_bar.update(completed_steps)
......
...@@ -628,10 +628,11 @@ def main(): ...@@ -628,10 +628,11 @@ def main():
resume_step = None resume_step = None
completed_steps = starting_epoch * num_update_steps_per_epoch completed_steps = starting_epoch * num_update_steps_per_epoch
else: else:
resume_step = int(training_difference.replace("step_", "")) # need to multiply `gradient_accumulation_steps` to reflect real steps
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
starting_epoch = resume_step // len(train_dataloader) starting_epoch = resume_step // len(train_dataloader)
resume_step -= starting_epoch * len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader)
completed_steps = resume_step completed_steps = resume_step // args.gradient_accumulation_steps
# update the progress_bar if load from checkpoint # update the progress_bar if load from checkpoint
progress_bar.update(completed_steps) progress_bar.update(completed_steps)
......
...@@ -501,10 +501,16 @@ def main(): ...@@ -501,10 +501,16 @@ def main():
if "epoch" in training_difference: if "epoch" in training_difference:
starting_epoch = int(training_difference.replace("epoch_", "")) + 1 starting_epoch = int(training_difference.replace("epoch_", "")) + 1
resume_step = None resume_step = None
completed_steps = starting_epoch * num_update_steps_per_epoch
else: else:
resume_step = int(training_difference.replace("step_", "")) # need to multiply `gradient_accumulation_steps` to reflect real steps
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
starting_epoch = resume_step // len(train_dataloader) starting_epoch = resume_step // len(train_dataloader)
resume_step -= starting_epoch * len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader)
completed_steps = resume_step // args.gradient_accumulation_steps
# update the progress_bar if load from checkpoint
progress_bar.update(completed_steps)
for epoch in range(starting_epoch, args.num_train_epochs): for epoch in range(starting_epoch, args.num_train_epochs):
model.train() model.train()
......
...@@ -659,10 +659,16 @@ def main(): ...@@ -659,10 +659,16 @@ def main():
if "epoch" in training_difference: if "epoch" in training_difference:
starting_epoch = int(training_difference.replace("epoch_", "")) + 1 starting_epoch = int(training_difference.replace("epoch_", "")) + 1
resume_step = None resume_step = None
completed_steps = starting_epoch * num_update_steps_per_epoch
else: else:
resume_step = int(training_difference.replace("step_", "")) # need to multiply `gradient_accumulation_steps` to reflect real steps
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
starting_epoch = resume_step // len(train_dataloader) starting_epoch = resume_step // len(train_dataloader)
resume_step -= starting_epoch * len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader)
completed_steps = resume_step completed_steps = resume_step // args.gradient_accumulation_steps
# update the progress_bar if load from checkpoint
progress_bar.update(completed_steps)
for epoch in range(starting_epoch, args.num_train_epochs): for epoch in range(starting_epoch, args.num_train_epochs):
model.train() model.train()
......
...@@ -613,7 +613,7 @@ def main(): ...@@ -613,7 +613,7 @@ def main():
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
starting_epoch = resume_step // len(train_dataloader) starting_epoch = resume_step // len(train_dataloader)
resume_step -= starting_epoch * len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader)
completed_steps = resume_step completed_steps = resume_step // args.gradient_accumulation_steps
# update the progress_bar if load from checkpoint # update the progress_bar if load from checkpoint
progress_bar.update(completed_steps) progress_bar.update(completed_steps)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment