Commit f6afd9a9 authored by Jessica Zhong, committed by Facebook GitHub Bot

Added logging and a command-line flag --use_elastic to enable torch elastic.

Reviewed By: wat3rBro

Differential Revision: D46460305

fbshipit-source-id: e91d9312c5d81ef1ba64ab169380329c8ad05f7c
parent 3ba489fa
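The diff below only shows the CPU/GLOO fallback being moved from distributed_worker into launch; the --use_elastic wiring and the added logging are elided from this excerpt. As a rough illustration only, the following sketch shows one common way a --use_elastic flag is routed to PyTorch's elastic launcher (torch.distributed.launcher.api). The flag handling, the run_worker entry point, and the rendezvous settings are hypothetical placeholders, not the D2Go implementation.

# Hypothetical sketch only: one common way to route a --use_elastic flag to
# PyTorch's elastic launcher. Names such as run_worker and the rendezvous
# settings are illustrative and are not taken from D2Go.
import argparse
import logging

import torch
from torch.distributed.launcher.api import LaunchConfig, elastic_launch

logger = logging.getLogger(__name__)


def run_worker(cfg):
    # per-process entry point; stands in for the real training function
    return 0


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--use_elastic",
        action="store_true",
        help="launch workers via torch elastic instead of mp.spawn",
    )
    args = parser.parse_args()
    cfg = {}  # stands in for the real config object

    if args.use_elastic:
        logger.info("Launching workers with torch elastic")
        config = LaunchConfig(
            min_nodes=1,
            max_nodes=1,
            nproc_per_node=max(torch.cuda.device_count(), 1),
            rdzv_backend="c10d",
            rdzv_endpoint="localhost:29500",
        )
        # elastic_launch returns a callable; invoking it spawns the workers
        elastic_launch(config, run_worker)(cfg)
    else:
        run_worker(cfg)


if __name__ == "__main__":
    main()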
@@ -74,18 +74,6 @@ def distributed_worker(
     ) # set the global shared context from the args passed in by mp spawn
     dist_params = dist_params or DistributedParams.from_environ()
-    if get_launch_environment() == "local" and not torch.cuda.is_available():
-        assert len(args) > 0, args
-        cfg = args[0]
-        if isinstance(cfg, CfgNode) and cfg.MODEL.DEVICE == "cuda":
-            logger.warning(
-                "Detected that CUDA is not available on this machine, set MODEL.DEVICE"
-                " to cpu and backend to GLOO"
-            )
-            with temp_defrost(cfg):
-                cfg.MODEL.DEVICE = "cpu"
-            args.backend = "GLOO"
     with enable_dist_process_groups(backend, init_method, dist_params, timeout):
         d2_comm._LOCAL_PROCESS_GROUP = mcv_comm._LOCAL_PROCESS_GROUP
         # Now the D2's comm module should be fully functional
@@ -119,6 +107,18 @@ def launch(
     - Automatically convert GPU to CPU if CUDA is not available.
     - Add D2Go-specific initialziation in the _distributed_worker.
     """
+    if get_launch_environment() == "local" and not torch.cuda.is_available():
+        assert len(args) > 0, args
+        cfg = args[0]
+        if isinstance(cfg, CfgNode) and cfg.MODEL.DEVICE == "cuda":
+            logger.warning(
+                "Detected that CUDA is not available on this machine, set MODEL.DEVICE"
+                " to cpu and backend to GLOO"
+            )
+            with temp_defrost(cfg):
+                cfg.MODEL.DEVICE = "cpu"
+            backend = "GLOO"
     return _launch(
         main_func=main_func,
         num_processes_per_machine=num_processes_per_machine,
...
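For context, the behavioral point of the move shown above is that the fallback now runs in the launcher (parent) process and rebinds the local backend variable before _launch spawns any workers, whereas the removed code mutated args.backend inside each already-running worker, too late to affect process-group setup. A minimal, hypothetical sketch of that "adjust before spawn" pattern follows; the worker/launch_sketch names and dict-based config are placeholders, not D2Go APIs.

# Hypothetical sketch of the "adjust before spawn" pattern used above; the
# names worker/launch_sketch and the dict-based config are illustrative only.
import logging

import torch
import torch.multiprocessing as mp

logger = logging.getLogger(__name__)


def worker(rank, cfg, backend):
    # every worker already sees the adjusted device and backend
    print(f"rank {rank}: device={cfg['DEVICE']}, backend={backend}")


def launch_sketch(cfg, num_processes, backend="NCCL"):
    # decide the fallback once, in the parent, before any worker exists
    if cfg["DEVICE"] == "cuda" and not torch.cuda.is_available():
        logger.warning("CUDA not available, falling back to CPU and GLOO")
        cfg["DEVICE"] = "cpu"
        backend = "GLOO"  # rebinding here changes what the workers receive
    mp.spawn(worker, args=(cfg, backend), nprocs=num_processes)


if __name__ == "__main__":
    launch_sketch({"DEVICE": "cuda"}, num_processes=2)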