chenpangpang/transformers
"...git@developer.sourcefind.cn:chenpangpang/open-webui.git" did not exist on "0b62bbb52eab853e20ee85a2a855518f2bc2cbc7"
Unverified commit e1da89cc, authored Mar 17, 2022 by Sylvain Gugger, committed via GitHub on Mar 17, 2022.
Fix reproducibility in Training for PyTorch 1.11 (#16209)
Parent: e5101c2e

Showing 1 changed file, src/transformers/trainer.py, with 12 additions and 3 deletions (+12, -3).
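Some background on the fix: when resuming from a checkpoint with `ignore_data_skip` unset, the Trainer replays the already-completed epochs so that the sampler's RNG ends up where it would be in an uninterrupted run. Before this commit it did so by merely beginning an iteration over the dataloader. From PyTorch 1.11 on, `RandomSampler` performs one more random draw at the very end of iteration, so starting an epoch no longer leaves the generator in the same state as finishing one. A minimal sketch of the mismatch (not part of the commit; the dataset, seed, and batch size are arbitrary illustrations):

```python
# Compare the sampler generator's state after merely starting an epoch
# (the pre-commit replay) vs. consuming the sampler fully (the fix).
import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

dataset = TensorDataset(torch.arange(10).float())

def generator_state(consume_all: bool) -> torch.Tensor:
    generator = torch.Generator()
    generator.manual_seed(42)  # arbitrary seed for the illustration
    sampler = RandomSampler(dataset, generator=generator)
    loader = DataLoader(dataset, batch_size=2, sampler=sampler)
    if consume_all:
        _ = list(loader.sampler)  # exhaust the whole sampler
    else:
        for _ in loader:  # begin an iteration only, then stop
            break
    return generator.get_state()

# True on PyTorch < 1.11; False from 1.11 on, because RandomSampler.__iter__
# now draws one extra permutation after yielding its last index.
print(torch.equal(generator_state(False), generator_state(True)))
```

The commit therefore keeps the cheap "start an iteration" path for older PyTorch versions and for non-random samplers, and consumes the whole sampler otherwise: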
src/transformers/trainer.py:

```diff
@@ -1354,9 +1354,18 @@ class Trainer:
         # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point.
         if not args.ignore_data_skip:
             for epoch in range(epochs_trained):
-                # We just need to begin an iteration to create the randomization of the sampler.
-                for _ in train_dataloader:
-                    break
+                is_random_sampler = hasattr(train_dataloader, "sampler") and isinstance(
+                    train_dataloader.sampler, RandomSampler
+                )
+                if version.parse(torch.__version__) < version.parse("1.11") or not is_random_sampler:
+                    # We just need to begin an iteration to create the randomization of the sampler.
+                    # That was before PyTorch 1.11 however...
+                    for _ in train_dataloader:
+                        break
+                else:
+                    # Otherwise we need to call the whooooole sampler cause there is some random operation added
+                    # AT THE VERY END!
+                    _ = list(train_dataloader.sampler)
 
         for epoch in range(epochs_trained, num_train_epochs):
             if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
```
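To see what the `_ = list(train_dataloader.sampler)` path buys, here is a hypothetical resume scenario (again not from the commit; the dataset, seed, and epoch count are made up) checking that a resumed run shuffles epoch `epochs_trained` exactly as an uninterrupted run would:

```python
# Replay two "already trained" epochs by exhausting the sampler, then verify
# that the next epoch's order matches an uninterrupted run.
import torch
from torch.utils.data import RandomSampler, TensorDataset

dataset = TensorDataset(torch.arange(10).float())
epochs_trained = 2  # arbitrary number of epochs to skip

def fresh_sampler() -> RandomSampler:
    generator = torch.Generator()
    generator.manual_seed(42)  # arbitrary seed for the illustration
    return RandomSampler(dataset, generator=generator)

# Uninterrupted run: record every epoch's shuffle up to epochs_trained.
sampler = fresh_sampler()
reference = [list(sampler) for _ in range(epochs_trained + 1)]

# Resumed run: exhaust the sampler once per skipped epoch, as the fix does.
sampler = fresh_sampler()
for _ in range(epochs_trained):
    _ = list(sampler)
resumed = list(sampler)

print(resumed == reference[epochs_trained])  # True, on PyTorch 1.11 included
```

Note that exhausting `train_dataloader.sampler` only materializes indices; no batches are loaded, so replaying skipped epochs this way stays cheap even for large datasets.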