Commit 9660a43d authored by Lukas Jarosch, committed by jnwei

Fix distributed seeding behavior

This adds workers=True to the Lightning seed_everything function, which guarantees different random states across all processes in distributed training. Previously, dataloader worker processes with the same worker ID on different GPUs could share the same random state.

Note that this breaks reproducibility between runs made before and after this change.

Also removes the seed and suppress_output modules, which are no longer used anywhere in OpenFold.
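
For reference, a minimal sketch (not part of this commit, with a hypothetical entry-point name) of the seeding call the training script now uses:

import pytorch_lightning as pl

def run_training(seed: int):
    # Illustrative sketch only. With workers=True, seed_everything also seeds
    # DataLoader worker processes: Lightning installs a worker_init_fn that
    # derives each worker's seed from the base seed, the worker id, and the
    # process's global rank, so identically numbered workers on different
    # GPUs no longer end up with the same random state.
    pl.seed_everything(seed, workers=True)

    # ... build the LightningModule, DataModule, and Trainer as usual ...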
parent cc6deaa8
openfold/utils/seed.py (deleted)

import os
import logging
import random
import numpy as np
from pytorch_lightning.utilities.seed import seed_everything
from openfold.utils.suppress_output import SuppressLogging


def seed_globally(seed=None):
    if("PL_GLOBAL_SEED" not in os.environ):
        if(seed is None):
            seed = random.randint(0, np.iinfo(np.uint32).max)
        os.environ["PL_GLOBAL_SEED"] = str(seed)
        logging.info(f'os.environ["PL_GLOBAL_SEED"] set to {seed}')

    # seed_everything is a bit log-happy
    with SuppressLogging(logging.INFO):
        seed_everything(seed=None)
openfold/utils/suppress_output.py (deleted)

import logging
import sys


class SuppressStdout:
    def __enter__(self):
        self.stdout = sys.stdout
        dev_null = open("/dev/null", "w")
        sys.stdout = dev_null

    def __exit__(self, typ, value, traceback):
        fp = sys.stdout
        sys.stdout = self.stdout
        fp.close()


class SuppressLogging:
    def __init__(self, level):
        self.level = level

    def __enter__(self):
        logging.disable(self.level)

    def __exit__(self, typ, value, traceback):
        logging.disable(logging.NOTSET)
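
For context, this is roughly how the removed helpers were used (illustrative only; the import path reflects the module before its removal):

import logging

from openfold.utils.suppress_output import SuppressLogging, SuppressStdout

# SuppressLogging temporarily disables all log records at or below the given
# level for the duration of the block.
with SuppressLogging(logging.INFO):
    logging.info("this message is swallowed")

# SuppressStdout redirects sys.stdout to /dev/null inside the block.
with SuppressStdout():
    print("this output is swallowed")
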
@@ -8,6 +8,7 @@ from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor
 from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
 from pytorch_lightning.loggers import WandbLogger
 from pytorch_lightning.plugins.training_type import DeepSpeedPlugin, DDPPlugin
+from pytorch_lightning.utilities.seed import seed_everything
 import torch
 from openfold.config import model_config
@@ -23,7 +24,6 @@ from openfold.utils.exponential_moving_average import ExponentialMovingAverage
 from openfold.utils.loss import AlphaFoldLoss, lddt_ca
 from openfold.utils.lr_schedulers import AlphaFoldLRScheduler
 from openfold.utils.multi_chain_permutation import multi_chain_permutation_align
-from openfold.utils.seed import seed_everything
 from openfold.utils.superimposition import superimpose
 from openfold.utils.tensor_utils import tensor_tree_map
 from openfold.utils.validation_metrics import (
@@ -273,7 +273,7 @@ class OpenFoldWrapper(pl.LightningModule):
 def main(args):
     if(args.seed is not None):
-        seed_everything(args.seed)
+        seed_everything(args.seed, workers=True)
     config = model_config(
         args.config_preset,
...