Commit 57809b0f authored by Zhicheng Yan's avatar Zhicheng Yan Committed by Facebook GitHub Bot
Browse files

support FP16 gradient compression

Summary:
Pull Request resolved: https://github.com/facebookresearch/d2go/pull/70

DDP supports an fp16_compress_hook which compresses the gradient to FP16 before communication. This can result in a significant speed up.

Add one argument `_C.MODEL.DDP_FP16_GRAD_COMPRESS` to trigger it.

Reviewed By: zhanghang1989

Differential Revision: D28467701

fbshipit-source-id: 3c80865222f48eb8fe6947ea972448c445ee3ef3
parent daf37a84
@@ -59,6 +59,8 @@ def get_default_cfg(_C):
     # Set find_unused_parameters for DistributedDataParallel.
     _C.MODEL.DDP_FIND_UNUSED_PARAMETERS = False
+    # Set FP16 gradient compression for DistributedDataParallel.
+    _C.MODEL.DDP_FP16_GRAD_COMPRESS = False
     # Set default optimizer
     _C.SOLVER.OPTIMIZER = "sgd"
@@ -16,7 +16,7 @@ from d2go.setup import (
     setup_after_launch,
 )
 from d2go.utils.misc import print_metrics_table, dump_trained_model_configs
-from torch.nn.parallel import DistributedDataParallel
+from detectron2.engine.defaults import create_ddp_model

 logger = logging.getLogger("d2go.tools.train_net")
@@ -53,13 +53,13 @@ def main(
         "metrics": metrics,
     }

-    if comm.get_world_size() > 1:
-        model = DistributedDataParallel(
-            model,
-            device_ids=None if cfg.MODEL.DEVICE == "cpu" else [comm.get_local_rank()],
-            broadcast_buffers=False,
-            find_unused_parameters=cfg.MODEL.DDP_FIND_UNUSED_PARAMETERS,
-        )
+    model = create_ddp_model(
+        model,
+        fp16_compression=cfg.MODEL.DDP_FP16_GRAD_COMPRESS,
+        device_ids=None if cfg.MODEL.DEVICE == "cpu" else [comm.get_local_rank()],
+        broadcast_buffers=False,
+        find_unused_parameters=cfg.MODEL.DDP_FIND_UNUSED_PARAMETERS,
+    )

     trained_cfgs = runner.do_train(cfg, model, resume=resume)
     metrics = runner.do_test(cfg, model)
@@ -88,6 +88,7 @@ def run_with_cmdline_args(args):
         args=(cfg, output_dir, runner, args.eval_only, args.resume),
     )

+
 def cli():
     parser = basic_argument_parser(requires_output_dir=False)
     parser.add_argument(
@@ -100,5 +101,6 @@ def cli():
     )
     run_with_cmdline_args(parser.parse_args())

+
 if __name__ == "__main__":
     cli()

(NOTE(review): both hunks report one added line with no visible textual change in the garbled extraction; the additions are reconstructed here as blank separator lines — confirm against the original commit.)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment