Commit b9aa4855 authored by Valentin Andrei, committed by Facebook GitHub Bot

Fix LR auto-scale for multi-tensor optimizers

Reviewed By: stephenyan1231, zhanghang1989

Differential Revision: D30903817

fbshipit-source-id: 578e6b02a1bd59b1bd841399fc60111d320ae9aa
parent 3fd2e635
@@ -19,7 +19,7 @@ def reduce_param_groups(param_groups: List[Dict[str, Any]]):
     # with the same lr and weight_decay in a single group. This approach speeds
     # up optimizer step significantly.
-    dict_new_groups: Dict[str, Dict[str, Any]] = {}
+    dict_new_groups: Dict[tuple, Dict[str, Any]] = {}
     for param_group in param_groups:
         # value is a list of parameters from the previous group
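Note on the annotation fix above: `reduce_param_groups` merges parameter groups that share the same options, and the merged-group table is keyed by a tuple of those options rather than a string, which is what the corrected type hint reflects. A minimal sketch of that grouping idea, assuming an illustrative key built from the non-`params` entries (not the exact d2go implementation):

    # Hedged sketch: merge param groups that share the same hyperparameters.
    from typing import Any, Dict, List

    def merge_groups(param_groups: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        merged: Dict[tuple, Dict[str, Any]] = {}
        for group in param_groups:
            group = dict(group)                 # copy so the caller's dicts stay intact
            params = group.pop("params")
            key = tuple(sorted(group.items()))  # e.g. (("lr", 0.01), ("weight_decay", 1e-4))
            merged.setdefault(key, dict(group, params=[]))
            merged[key]["params"].extend(params)
        return list(merged.values())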
@@ -114,6 +114,8 @@ def default_scale_d2_configs(cfg, new_world_size):
     lr_scales = {
         "sgd": gpu_scale,
         "adamw": 1,
+        "sgd_mt": gpu_scale,
+        "adamw_mt": 1,
     }
     optim_name = cfg.SOLVER.OPTIMIZER.lower()
     lr_scale = lr_scales[optim_name] if optim_name in lr_scales else gpu_scale
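Note on the two new entries: when the world size changes, the base LR is rescaled by a per-optimizer factor looked up by optimizer name, and any name missing from the table falls back to `gpu_scale`. Before this fix the multi-tensor names missed the table, so "adamw_mt" was linearly scaled like SGD instead of being left unscaled like "adamw". A hedged sketch of that lookup, with an assumed helper name and an assumed way of computing `gpu_scale`:

    def scale_base_lr(cfg, old_world_size, new_world_size):
        # Assumed here: gpu_scale is the ratio of new to old world size.
        gpu_scale = new_world_size / old_world_size
        lr_scales = {
            "sgd": gpu_scale,      # linear-scaling rule for SGD
            "adamw": 1,            # AdamW LR is left unscaled
            "sgd_mt": gpu_scale,   # multi-tensor SGD behaves like plain SGD
            "adamw_mt": 1,         # multi-tensor AdamW behaves like plain AdamW
        }
        optim_name = cfg.SOLVER.OPTIMIZER.lower()
        # Unknown optimizer names still default to gpu_scale, as in the diff above.
        lr_scale = lr_scales[optim_name] if optim_name in lr_scales else gpu_scale
        cfg.SOLVER.BASE_LR *= lr_scale
        return cfg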
@@ -2,6 +2,7 @@
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import random
 import unittest
 import d2go.runner.default_runner as default_runner
@@ -12,30 +13,49 @@ from d2go.optimizer import build_optimizer_mapper
 class TestArch(torch.nn.Module):
     def __init__(self):
         super().__init__()
-        self.conv = torch.nn.Conv2d(3, 4, kernel_size=3, stride=1, padding=1)
+        self.conv = torch.nn.Conv2d(3, 4, kernel_size=5, stride=1, padding=1)
         self.bn = torch.nn.BatchNorm2d(4)
         self.relu = torch.nn.ReLU(inplace=True)
         self.avgpool = torch.nn.AdaptiveAvgPool2d((1, 1))
         self.linear = torch.nn.Linear(4, 1)

     def forward(self, x):
         ret = self.conv(x)
         ret = self.bn(ret)
         ret = self.relu(ret)
         ret = self.avgpool(ret)
         ret = torch.transpose(ret, 1, 3)
         ret = self.linear(ret)
         return ret


 def _test_each_optimizer(cfg):
     print("Solver: " + str(cfg.SOLVER.OPTIMIZER))

     model = TestArch()
+    criterion = torch.nn.BCEWithLogitsLoss()
     optimizer = build_optimizer_mapper(cfg, model)
     optimizer.zero_grad()

-    for _ in range(10):
-        x = torch.rand(1, 3, 24, 24)
-        y = model(x)
-        loss = y.mean()
+    random.seed(20210912)
+    for _ in range(2500):
+        target = torch.empty(1, 1, 1, 1).fill_(random.randint(0, 1))
+        x = torch.add(torch.rand(1, 3, 16, 16), 2 * target)
+        y_pred = model(x)
+        loss = criterion(y_pred, target)
         loss.backward()
         optimizer.step()

+    n_correct = 0
+    for _ in range(200):
+        target = torch.empty(1, 1, 1, 1).fill_(random.randint(0, 1))
+        x = torch.add(torch.rand(1, 3, 16, 16), 2 * target)
+        y_pred = torch.round(torch.sigmoid(model(x)))
+        if y_pred == target:
+            n_correct += 1
+    print("Correct prediction rate {0}.".format(n_correct / 200))


 class TestOptimizer(unittest.TestCase):
     def test_all_optimizers(self):
@@ -45,6 +65,7 @@ class TestOptimizer(unittest.TestCase):
         for optimizer_name in ["SGD", "AdamW", "SGD_MT", "AdamW_MT"]:
             for mult in multipliers:
+                cfg.SOLVER.BASE_LR = 0.01
                 cfg.SOLVER.OPTIMIZER = optimizer_name
                 cfg.SOLVER.MULTIPLIERS = mult
                 _test_each_optimizer(cfg)
@@ -54,6 +75,7 @@ class TestOptimizer(unittest.TestCase):
         cfg = runner.get_default_cfg()
         for optimizer_name in ["SGD", "AdamW", "SGD_MT", "AdamW_MT"]:
+            cfg.SOLVER.BASE_LR = 0.02
             cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 0.2
             cfg.SOLVER.CLIP_GRADIENTS.ENABLED = True
             cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "full_model"
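For reference, a hedged sketch of driving the updated test helper by hand, assuming it runs inside the test module next to `_test_each_optimizer`; the `Detectron2GoRunner` class and the default config fields used here are taken as assumptions from the surrounding test file, not confirmed by this diff:

    import d2go.runner.default_runner as default_runner

    runner = default_runner.Detectron2GoRunner()   # assumed runner class
    cfg = runner.get_default_cfg()
    cfg.SOLVER.BASE_LR = 0.01
    cfg.SOLVER.OPTIMIZER = "SGD_MT"                # exercise a multi-tensor optimizer
    _test_each_optimizer(cfg)                      # trains TestArch and prints the accuracy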