Commit 0404891e authored by zhe chen's avatar zhe chen
Browse files

Fix bug in newer slurm system

parent d36b7c67
...@@ -581,7 +581,9 @@ if __name__ == '__main__': ...@@ -581,7 +581,9 @@ if __name__ == '__main__':
assert has_native_amp, 'Please update pytorch(1.6+) to support amp!' assert has_native_amp, 'Please update pytorch(1.6+) to support amp!'
# init distributed env # init distributed env
if 'SLURM_PROCID' in os.environ and int(os.environ['SLURM_TASKS_PER_NODE']) != 1: # In the newer versions of Slurm, the format of `SLURM_TASKS_PER_NODE` has changed from a single
# numeric string to a format like `8(xn)`, which indicates that n nodes are used in the training.
if 'SLURM_PROCID' in os.environ and int(os.environ['SLURM_TASKS_PER_NODE'][0]) != 1:
print('\nDist init: SLURM') print('\nDist init: SLURM')
rank = int(os.environ['SLURM_PROCID']) rank = int(os.environ['SLURM_PROCID'])
gpu = rank % torch.cuda.device_count() gpu = rank % torch.cuda.device_count()
......
...@@ -497,7 +497,9 @@ if __name__ == '__main__': ...@@ -497,7 +497,9 @@ if __name__ == '__main__':
args, config = parse_option() args, config = parse_option()
# init distributed env # init distributed env
if 'SLURM_PROCID' in os.environ and int(os.environ['SLURM_TASKS_PER_NODE']) != 1: # In the newer versions of Slurm, the format of `SLURM_TASKS_PER_NODE` has changed from a single
# numeric string to a format like `8(xn)`, which indicates that n nodes are used in the training.
if 'SLURM_PROCID' in os.environ and int(os.environ['SLURM_TASKS_PER_NODE'][0]) != 1:
print('\nDist init: SLURM') print('\nDist init: SLURM')
rank = int(os.environ['SLURM_PROCID']) rank = int(os.environ['SLURM_PROCID'])
gpu = rank % torch.cuda.device_count() gpu = rank % torch.cuda.device_count()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment