[Fix] Fix local debug mode not restricting the resources (#522)

* [Fix] Fix local debug mode not restricting the resources * Minor fix

[Fix] Fix local debug mode not restricting the resources (#522)
* [Fix] Fix local debug mode not restricting the resources * Minor fix
b9270c3a · Hubert · GitHub · 229a65f3 · b9270c3a
Unverified Commit b9270c3a authored Oct 30, 2023 by Hubert Committed by GitHub Oct 30, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 35 additions and 23 deletions

opencompass/runners/local.py opencompass/runners/local.py +35 -23

No files found.
--- a/opencompass/runners/local.py
+++ b/opencompass/runners/local.py
@@ -2,6 +2,7 @@ import os
 import os.path as osp
 import re
 import subprocess
+import sys
 import time
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
@@ -19,6 +20,18 @@ from opencompass.utils import get_logger
 from .base import BaseRunner
+def get_command_template(gpu_ids: List[int]) -> str:
+    """Format command template given available gpu ids."""
+    if sys.platform == 'win32':  # sys.platform is 'win32' on every Windows build
+        # Windows format: `set` the variable, then `&` before the task command
+        tmpl = 'set CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
+        tmpl += ' & {task_cmd}'
+    else:  # non-Windows: shell-style VAR=value prefix before the command
+        tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(str(i) for i in gpu_ids)
+        tmpl += ' {task_cmd}'
+    return tmpl  # '{task_cmd}' stays literal; passed on as `template=`
 @RUNNERS.register_module()
 class LocalRunner(BaseRunner):
    """Local runner. Start tasks by local python.
@@ -55,17 +68,36 @@ class LocalRunner(BaseRunner):
        """
        status = []
+        import torch
+        if 'CUDA_VISIBLE_DEVICES' in os.environ:
+            all_gpu_ids = [
+                int(i) for i in re.findall(r'(?<!-)\d+',
+                                           os.getenv('CUDA_VISIBLE_DEVICES'))
+            ]
+        else:
+            all_gpu_ids = list(range(torch.cuda.device_count()))
        if self.debug:
            for task in tasks:
                task = TASKS.build(dict(cfg=task, type=self.task_cfg['type']))
                task_name = task.name
+                num_gpus = task.num_gpus
+                assert len(all_gpu_ids) >= num_gpus
                # get cmd
                mmengine.mkdir_or_exist('tmp/')
                param_file = f'tmp/{os.getpid()}_params.py'
                try:
                    task.cfg.dump(param_file)
-                    cmd = task.get_command(cfg_path=param_file,
+                    # if use torchrun, restrict it behaves the same as non
-                                           template='{task_cmd}')
+                    # debug mode, otherwise, the torchrun will use all the
+                    # available resources which might cause inconsistent
+                    # behavior.
+                    if len(all_gpu_ids) > num_gpus and num_gpus > 0:
+                        get_logger().warning(f'Only use {num_gpus} GPUs for '
+                                             f'total {len(all_gpu_ids)} '
+                                             'available GPUs in debug mode.')
+                    tmpl = get_command_template(all_gpu_ids[:num_gpus])
+                    cmd = task.get_command(cfg_path=param_file, template=tmpl)
                    # run in subprocess if starts with torchrun etc.
                    if cmd.startswith('python'):
                        task.run()
@@ -75,15 +107,6 @@ class LocalRunner(BaseRunner):
                    os.remove(param_file)
                status.append((task_name, 0))
        else:
-            import torch
-            if 'CUDA_VISIBLE_DEVICES' in os.environ:
-                all_gpu_ids = [
-                    int(i) for i in re.findall(
-                        r'(?<!-)\d+', os.getenv('CUDA_VISIBLE_DEVICES'))
-                ]
-            else:
-                all_gpu_ids = list(range(torch.cuda.device_count()))
            if len(all_gpu_ids) > 0:
                gpus = np.zeros(max(all_gpu_ids) + 1, dtype=np.uint)
                gpus[all_gpu_ids] = self.max_workers_per_gpu
@@ -145,18 +168,7 @@ class LocalRunner(BaseRunner):
        param_file = f'tmp/{os.getpid()}_{index}_params.py'
        try:
            task.cfg.dump(param_file)
+            tmpl = get_command_template(gpu_ids)
-            # Build up local command
-            import sys
-            if sys.platform == 'win32':  # Always return win32 for Windows
-                # use command in Windows format
-                tmpl = 'set CUDA_VISIBLE_DEVICES=' + ','.join(
-                    str(i) for i in gpu_ids)
-                tmpl += ' & {task_cmd}'
-            else:
-                tmpl = 'CUDA_VISIBLE_DEVICES=' + ','.join(
-                    str(i) for i in gpu_ids)
-                tmpl += ' {task_cmd}'
            get_cmd = partial(task.get_command,
                              cfg_path=param_file,
                              template=tmpl)