[textual_inversion_sdxl.py] fix lr scheduler steps count (#11557)

fix lr scheduler steps count

Co-authored-by: Linoy Tsaban <57615435+linoytsaban@users.noreply.github.com>

[textual_inversion_sdxl.py] fix lr scheduler steps count (#11557)
fix lr scheduler steps count

Co-authored-by: Linoy Tsaban <57615435+linoytsaban@users.noreply.github.com>
89ddb6c0 · Yuanzhou Cai · GitHub · be2fb77d · 89ddb6c0
Unverified Commit 89ddb6c0 authored May 29, 2025 by Yuanzhou Cai Committed by GitHub May 29, 2025
Show whitespace changes
Inline Side-by-side

Showing with 18 additions and 7 deletions

examples/textual_inversion/textual_inversion_sdxl.py examples/textual_inversion/textual_inversion_sdxl.py +18 -7

No files found.
--- a/examples/textual_inversion/textual_inversion_sdxl.py
+++ b/examples/textual_inversion/textual_inversion_sdxl.py
@@ -793,17 +793,22 @@ def main():
    )
    # Scheduler and math around the number of training steps.
-    overrode_max_train_steps = False
+    # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
    if args.max_train_steps is None:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        len_train_dataloader_after_sharding = math.ceil(len(train_dataloader) / accelerator.num_processes)
-        overrode_max_train_steps = True
+        num_update_steps_per_epoch = math.ceil(len_train_dataloader_after_sharding / args.gradient_accumulation_steps)
+        num_training_steps_for_scheduler = (
+            args.num_train_epochs * num_update_steps_per_epoch * accelerator.num_processes
+        )
+    else:
+        num_training_steps_for_scheduler = args.max_train_steps * accelerator.num_processes
    lr_scheduler = get_scheduler(
        args.lr_scheduler,
        optimizer=optimizer,
-        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_warmup_steps=num_warmup_steps_for_scheduler,
-        num_training_steps=args.max_train_steps * accelerator.num_processes,
+        num_training_steps=num_training_steps_for_scheduler,
        num_cycles=args.lr_num_cycles,
    )
@@ -829,8 +834,14 @@ def main():
    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    if overrode_max_train_steps:
+    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        if num_training_steps_for_scheduler != args.max_train_steps * accelerator.num_processes:
+            logger.warning(
+                f"The length of the 'train_dataloader' after 'accelerator.prepare' ({len(train_dataloader)}) does not match "
+                f"the expected length ({len_train_dataloader_after_sharding}) when the learning rate scheduler was created. "
+                f"This inconsistency may result in the learning rate scheduler not functioning properly."
+            )
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)