Commit f6afd9a9 authored by Jessica Zhong, committed by Facebook GitHub Bot

Added logging and a command-line flag --use_elastic to enable torch elastic.

Reviewed By: wat3rBro

Differential Revision: D46460305

fbshipit-source-id: e91d9312c5d81ef1ba64ab169380329c8ad05f7c
parent 3ba489fa
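The diff below only shows the CPU/GLOO fallback being moved from distributed_worker into launch; the --use_elastic wiring and the added logging are elided from this excerpt. As a rough illustration only, the following sketch shows one common way a --use_elastic flag is routed to PyTorch's elastic launcher (torch.distributed.launcher.api). The flag handling, the run_worker entry point, and the rendezvous settings are hypothetical placeholders, not the D2Go implementation.

# Hypothetical sketch only: one common way to route a --use_elastic flag to
# PyTorch's elastic launcher. Names such as run_worker and the rendezvous
# settings are illustrative and are not taken from D2Go.
import argparse
import logging

import torch
from torch.distributed.launcher.api import LaunchConfig, elastic_launch

logger = logging.getLogger(__name__)


def run_worker(cfg):
    # per-process entry point; stands in for the real training function
    return 0


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--use_elastic",
        action="store_true",
        help="launch workers via torch elastic instead of mp.spawn",
    )
    args = parser.parse_args()
    cfg = {}  # stands in for the real config object

    if args.use_elastic:
        logger.info("Launching workers with torch elastic")
        config = LaunchConfig(
            min_nodes=1,
            max_nodes=1,
            nproc_per_node=max(torch.cuda.device_count(), 1),
            rdzv_backend="c10d",
            rdzv_endpoint="localhost:29500",
        )
        # elastic_launch returns a callable; invoking it spawns the workers
        elastic_launch(config, run_worker)(cfg)
    else:
        run_worker(cfg)


if __name__ == "__main__":
    main()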
@@ -74,18 +74,6 @@ def distributed_worker(
     ) # set the global shared context from the args passed in by mp spawn
     dist_params = dist_params or DistributedParams.from_environ()
-    if get_launch_environment() == "local" and not torch.cuda.is_available():
-        assert len(args) > 0, args
-        cfg = args[0]
-        if isinstance(cfg, CfgNode) and cfg.MODEL.DEVICE == "cuda":
-            logger.warning(
-                "Detected that CUDA is not available on this machine, set MODEL.DEVICE"
-                " to cpu and backend to GLOO"
-            )
-            with temp_defrost(cfg):
-                cfg.MODEL.DEVICE = "cpu"
-            args.backend = "GLOO"
     with enable_dist_process_groups(backend, init_method, dist_params, timeout):
         d2_comm._LOCAL_PROCESS_GROUP = mcv_comm._LOCAL_PROCESS_GROUP
         # Now the D2's comm module should be fully functional
@@ -119,6 +107,18 @@ def launch(
     - Automatically convert GPU to CPU if CUDA is not available.
     - Add D2Go-specific initialziation in the _distributed_worker.
     """
+    if get_launch_environment() == "local" and not torch.cuda.is_available():
+        assert len(args) > 0, args
+        cfg = args[0]
+        if isinstance(cfg, CfgNode) and cfg.MODEL.DEVICE == "cuda":
+            logger.warning(
+                "Detected that CUDA is not available on this machine, set MODEL.DEVICE"
+                " to cpu and backend to GLOO"
+            )
+            with temp_defrost(cfg):
+                cfg.MODEL.DEVICE = "cpu"
+            backend = "GLOO"
     return _launch(
         main_func=main_func,
         num_processes_per_machine=num_processes_per_machine,
...
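For context, the behavioral point of the move shown above is that the fallback now runs in the launcher (parent) process and rebinds the local backend variable before _launch spawns any workers, whereas the removed code mutated args.backend inside each already-running worker, too late to affect process-group setup. A minimal, hypothetical sketch of that "adjust before spawn" pattern follows; the worker/launch_sketch names and dict-based config are placeholders, not D2Go APIs.

# Hypothetical sketch of the "adjust before spawn" pattern used above; the
# names worker/launch_sketch and the dict-based config are illustrative only.
import logging

import torch
import torch.multiprocessing as mp

logger = logging.getLogger(__name__)


def worker(rank, cfg, backend):
    # every worker already sees the adjusted device and backend
    print(f"rank {rank}: device={cfg['DEVICE']}, backend={backend}")


def launch_sketch(cfg, num_processes, backend="NCCL"):
    # decide the fallback once, in the parent, before any worker exists
    if cfg["DEVICE"] == "cuda" and not torch.cuda.is_available():
        logger.warning("CUDA not available, falling back to CPU and GLOO")
        cfg["DEVICE"] = "cpu"
        backend = "GLOO"  # rebinding here changes what the workers receive
    mp.spawn(worker, args=(cfg, backend), nprocs=num_processes)


if __name__ == "__main__":
    launch_sketch({"DEVICE": "cuda"}, num_processes=2)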