Unverified Commit 31336dae authored by Pedro Cuenca, committed by GitHub

Fix resume epoch for all training scripts except textual_inversion (#2079)

parent 0e98e839
@@ -757,14 +757,21 @@ def main(args):
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
+            path = dirs[-1] if len(dirs) > 0 else None

-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])

-        resume_global_step = global_step * args.gradient_accumulation_steps
-        first_epoch = resume_global_step // num_update_steps_per_epoch
-        resume_step = resume_global_step % num_update_steps_per_epoch
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)

     # Only show the progress bar once on each machine.
     progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)

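The core of the fix is the last three lines of each hunk: global_step counts optimizer updates, while the dataloader advances gradient_accumulation_steps batches per update, so the resume epoch must come from global_step and the in-epoch offset from the batch count. A minimal sketch with made-up numbers (none of these values come from the scripts) illustrates the difference:

# Sketch of the resume arithmetic with hypothetical values, not taken from any script.
gradient_accumulation_steps = 4
num_update_steps_per_epoch = 100      # optimizer updates per epoch
global_step = 250                     # optimizer updates completed at the checkpoint

resume_global_step = global_step * gradient_accumulation_steps   # 1000 dataloader batches consumed

# Old computation: divides a batch count by an update count, overshooting the epoch.
old_first_epoch = resume_global_step // num_update_steps_per_epoch   # 10
old_resume_step = resume_global_step % num_update_steps_per_epoch    # 0

# New computation: epoch from optimizer updates, in-epoch offset in dataloader batches.
new_first_epoch = global_step // num_update_steps_per_epoch                                         # 2
new_resume_step = resume_global_step % (num_update_steps_per_epoch * gradient_accumulation_steps)   # 200

print(old_first_epoch, old_resume_step)   # 10 0  -> would resume 8 epochs too late
print(new_first_epoch, new_resume_step)   # 2 200 -> resume in epoch 2, after 200 batches

The remaining hunks apply the same change to the other training scripts.
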
@@ -814,14 +814,21 @@ def main(args):
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
+            path = dirs[-1] if len(dirs) > 0 else None

-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])

-        resume_global_step = global_step * args.gradient_accumulation_steps
-        first_epoch = resume_global_step // num_update_steps_per_epoch
-        resume_step = resume_global_step % num_update_steps_per_epoch
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)

     # Only show the progress bar once on each machine.
     progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)

@@ -660,14 +660,21 @@ def main():
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
+            path = dirs[-1] if len(dirs) > 0 else None

-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])

-        resume_global_step = global_step * args.gradient_accumulation_steps
-        first_epoch = resume_global_step // num_update_steps_per_epoch
-        resume_step = resume_global_step % num_update_steps_per_epoch
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)

     # Only show the progress bar once on each machine.
     progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)

@@ -748,14 +748,21 @@ def main(args):
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
+            path = dirs[-1] if len(dirs) > 0 else None

-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])

-        resume_global_step = global_step * args.gradient_accumulation_steps
-        first_epoch = resume_global_step // num_update_steps_per_epoch
-        resume_step = resume_global_step % num_update_steps_per_epoch
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)

     # Only show the progress bar once on each machine.
     progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)

@@ -599,13 +599,21 @@ def main():
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
+            path = dirs[-1] if len(dirs) > 0 else None

-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])

-        first_epoch = global_step // num_update_steps_per_epoch
-        resume_step = global_step % num_update_steps_per_epoch
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)

     # Only show the progress bar once on each machine.
     progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)

@@ -651,14 +651,21 @@ def main():
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
+            path = dirs[-1] if len(dirs) > 0 else None

-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])

-        resume_global_step = global_step * args.gradient_accumulation_steps
-        first_epoch = resume_global_step // num_update_steps_per_epoch
-        resume_step = resume_global_step % num_update_steps_per_epoch
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)

     # Only show the progress bar once on each machine.
     progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)

@@ -439,14 +439,21 @@ def main(args):
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
+            path = dirs[-1] if len(dirs) > 0 else None

-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])

-        resume_global_step = global_step * args.gradient_accumulation_steps
-        first_epoch = resume_global_step // num_update_steps_per_epoch
-        resume_step = resume_global_step % num_update_steps_per_epoch
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)

     # Train!
     for epoch in range(first_epoch, args.num_epochs):

@@ -396,13 +396,21 @@ def main(args):
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
-            path = dirs[-1]
+            path = dirs[-1] if len(dirs) > 0 else None

-        accelerator.print(f"Resuming from checkpoint {path}")
-        accelerator.load_state(os.path.join(args.output_dir, path))
-        global_step = int(path.split("-")[1])
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])

-        resume_global_step = global_step * args.gradient_accumulation_steps
-        first_epoch = resume_global_step // num_update_steps_per_epoch
-        resume_step = resume_global_step % num_update_steps_per_epoch
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)

     for epoch in range(first_epoch, args.num_epochs):
         model.train()
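
For reference, first_epoch and resume_step are consumed by a skip-ahead check inside the epoch loop of these scripts; that loop is not part of this diff, so the following is only a hypothetical, self-contained sketch of the pattern, with dummy values standing in for the real dataloader. It shows why resume_step must be expressed in dataloader batches rather than optimizer updates: it is compared against the raw batch index.

# Hypothetical sketch of the skip-ahead pattern (dummy loop, not the actual training code from this diff).
gradient_accumulation_steps = 4
num_update_steps_per_epoch = 100
num_batches_per_epoch = num_update_steps_per_epoch * gradient_accumulation_steps
first_epoch, resume_step = 2, 200     # values from the arithmetic sketch above
num_epochs = 5

for epoch in range(first_epoch, num_epochs):
    for step in range(num_batches_per_epoch):   # stand-in for enumerate(train_dataloader)
        if epoch == first_epoch and step < resume_step:
            continue                             # skip batches already consumed before the checkpoint
        # forward/backward pass and optimizer step would go here
        pass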