Unverified commit fa11d338, authored by Min Xu and committed by GitHub

[test]: test adascale with oss (#328)

* [test]: test adascale with oss

* minor fix

* add a small comment

* refactor: moved find_tensor_by_shape

* refactor: move test golden data into its own module

* refactor: simplified the train function

* refactor: added comments as suggested
parent 2eb1b8ec

New file: fairscale/utils/golden_testing_data.py

# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
""" Golden data used in unit tests. """
adascale_test_data = [
    # "input" value is a list of input tensors for micro-batch/rank 0 and micro-batch/rank 1.
    {"input": [[1.0, 0], [0, 1.0]], "expected_gain": 2.0},
    {"input": [[1.0, 1.0], [1.0, 1.0]], "expected_gain": 1.0000001249999846},
    {"input": [[-1.0, 1.0], [1.0, -1.0]], "expected_gain": 2.0},
    {"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.5022222222222221},
    {"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.9433267229211089},
    # "inputs" to trigger multiple iteration tests, which make sure the
    # smoothing factor calculation is also covered.
    {"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.744159431359284},
]
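
For reference, the "expected_gain" values above can be sanity-checked by hand. The sketch below is editorial (it is not part of the commit) and assumes the textbook AdaScale gain estimator over S replicas: with unbiased estimates of the per-replica gradient variance sigma^2 and the squared norm of the true gradient mu^2, the gain is (sigma^2 + mu^2) / (sigma^2 / S + mu^2). For Linear(2, 2, bias=False) with loss = out.sum(), the weight gradient for input [x1, x2] is [[x1, x2], [x1, x2]]. Two cases are skipped: the identical-inputs case evaluates to exactly 1.0 under this formula (the golden 1.0000001249999846 looks like the effect of a small epsilon term in the library's estimator), and the case whose gradients average to zero relies on the implementation's clamping.

# Editorial sketch (not part of this commit): reproduce two of the golden gains
# above, assuming the textbook AdaScale gain estimator described in the note.
import numpy as np


def textbook_gain(inputs_per_rank):
    # For Linear(2, 2, bias=False) with loss = out.sum(), the weight gradient
    # for input [x1, x2] is [[x1, x2], [x1, x2]].
    grads = [np.tile(np.asarray(x, dtype=np.float64), (2, 1)) for x in inputs_per_rank]
    num_replicas = len(grads)
    g_avg = sum(grads) / num_replicas
    local_sqr = sum((g ** 2).sum() for g in grads) / num_replicas  # estimate of E[|g_i|^2]
    total_sqr = (g_avg ** 2).sum()                                 # |g_avg|^2
    var = num_replicas / (num_replicas - 1) * (local_sqr - total_sqr)  # unbiased sigma^2
    sqr = total_sqr - var / num_replicas                               # unbiased mu^2
    return (var + sqr) / (var / num_replicas + sqr)


print(textbook_gain([[1.0, 0], [0, 1.0]]))      # -> 2.0
print(textbook_gain([[1.0, 4.0], [5.0, 0.5]]))  # -> ~1.5022222222222221, matching the golden value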

New file: fairscale/utils/testing_memory.py

# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
""" Shared functions related to testing GPU memory sizes. """
import gc
from typing import Tuple
import torch

def find_tensor_by_shape(target_shape: Tuple, only_param: bool = True) -> bool:
    """ Find a tensor from the heap

        Args:
            target_shape (tuple):
                Tensor shape to locate.
            only_param (bool):
                Only match Parameter type (e.g. for weights).

        Returns:
            (bool):
                Return True if found.
    """
    for obj in gc.get_objects():
        try:
            # Only need to check parameter type objects if asked.
            if only_param and "torch.nn.parameter.Parameter" not in str(type(obj)):
                continue
            if torch.is_tensor(obj) or (hasattr(obj, "data") and torch.is_tensor(obj.data)):
                if obj.shape == target_shape:
                    return True
        except Exception as e:
            pass
    return False
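
A brief editorial note (not part of the commit): this helper is meant for heap-based leak checks such as the updated test_unhook further below; a layer is given a weight with a shape unlikely to occur elsewhere, and the test asserts that the parameter can, or can no longer, be found via the garbage collector. A minimal sketch of that pattern:

# Editorial sketch of the intended usage, mirroring the test_unhook change further below.
import gc

from torch.nn import Linear

from fairscale.utils.testing_memory import find_tensor_by_shape

model = Linear(123, 456, bias=False)  # (456, 123) is assumed to be a unique weight shape on the heap
assert find_tensor_by_shape((456, 123)), "the weight should be reachable while the model is alive"

del model  # drop the only reference to the module and its Parameter
gc.collect()
assert not find_tensor_by_shape((456, 123)), "the weight should be gone once nothing references it"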

Changes to test_ddp_adascale.py:

@@ -22,6 +22,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.optim import SGD
 
 from fairscale.optim import AdaScale
+from fairscale.utils.golden_testing_data import adascale_test_data
 from fairscale.utils.testing import skip_if_single_gpu
@@ -60,24 +61,8 @@ def _test_basic_func(rank, world_size, tempfile_name, test_case):
     dist.destroy_process_group()
 
 
-# IMPORTANT: make sure these test_cases values are sync'ed with the non-DDP
-# test in test_single_node_adascale.py. This way, we make sure gradient accumulation
-# works exactly like that in DDP.
 @skip_if_single_gpu
-@pytest.mark.parametrize(
-    "test_case",
-    [
-        # "input" value is a list of input tensors for rank 0 and rank 1.
-        {"input": [[1.0, 0], [0, 1.0]], "expected_gain": 2.0},
-        {"input": [[1.0, 1.0], [1.0, 1.0]], "expected_gain": 1.0000001249999846},
-        {"input": [[-1.0, 1.0], [1.0, -1.0]], "expected_gain": 2.0},
-        {"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.5022222222222221},
-        {"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.9433267229211089},
-        # "inputs" to trigger multiple iteration tests, which make sure the
-        # smoothing factor calculation is also covered.
-        {"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.744159431359284},
-    ],
-)
+@pytest.mark.parametrize("test_case", adascale_test_data)
 def test_basic(test_case):
     """Test adascale with DDP without gradient accumulation"""
     world_size = 2

New test file (AdaScale with OSS):

# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
# pylint: disable=missing-module-docstring
# pylint: disable=missing-class-docstring
# pylint: disable=missing-function-docstring
""" Test AdaScale with OSS. """
from statistics import mean
import tempfile
import numpy as np
import pytest
import torch
from torch import Tensor
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn import Linear, Sequential
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import SGD
from fairscale.optim import OSS, AdaScale
from fairscale.utils.golden_testing_data import adascale_test_data
from fairscale.utils.testing import skip_if_single_gpu

def _dist_init(rank, world_size, tempfile_name, backend):
    url = "file://" + tempfile_name
    dist.init_process_group(init_method=url, backend=backend, rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)


def _test_basic_func(rank, world_size, tempfile_name, test_case, oss, model=None):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")

    if model is None:
        model = Linear(2, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])

    if oss:
        # For now, we can only wrap AdaScale over OSS. If we do it the other way around,
        # AdaScale needs to take different parameter types, i.e. the parameter list, etc.
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
    else:
        optim = AdaScale(SGD(model.parameters(), lr=0.1))

    if "input" in test_case:
        inputs = [test_case["input"]]
    else:
        inputs = test_case["inputs"]

    for in_data in inputs:
        in_data = Tensor(in_data[rank]).cuda()
        out = model(in_data)
        out.sum().backward()
        optim.step()
        optim.zero_grad()

    assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()

    if "expected_mean_weight" in test_case:
        mean_weight = mean([model.module[i].weight.data.mean().item() for i in range(4)])
        assert np.allclose(mean_weight, test_case["expected_mean_weight"]), mean_weight

    dist.destroy_process_group()

@skip_if_single_gpu
@pytest.mark.parametrize("test_case", adascale_test_data)
def test_basic(test_case):
    """Test adascale with DDP + OSS with trivial model"""
    world_size = 2
    temp_file_name = tempfile.mkstemp()[1]

    mp.spawn(_test_basic_func, args=(world_size, temp_file_name, test_case, True), nprocs=world_size, join=True)

@skip_if_single_gpu
@pytest.mark.parametrize("oss", [True, False])
def test_sequential(oss):
    """Test adascale with DDP + OSS with a sequential model"""
    world_size = 2
    temp_file_name = tempfile.mkstemp()[1]

    # Run multiple iterations, check the gain for both oss and non-oss cases.
    #
    # The inputs are picked arbitrarily. I used vectors that are orthogonal.
    #
    # The gain and mean_weight values are recorded from my testing and used here
    # to ensure their value is unchanged from commit to commit unless we can
    # explain why.
    test_case = {
        "inputs": [[[1.0, 0], [0, 1.0]], [[0, 1.0], [1.0, 0]]],
        "expected_gain": 1.0335265132125744,
        "expected_mean_weight": 52.92657661437988,
    }

    # The model.
    model = Sequential(
        Linear(2, 3, bias=False), Linear(3, 4, bias=False), Linear(4, 5, bias=False), Linear(5, 6, bias=False)
    )

    # Weights need to be fixed for deterministic gain.
    model[0].weight.data.copy_(Tensor(range(6)).reshape(3, 2) / mean(range(6)))
    model[1].weight.data.copy_(Tensor(range(12)).reshape(4, 3) / mean(range(12)))
    model[2].weight.data.copy_(Tensor(range(20)).reshape(5, 4) / mean(range(20)))
    model[3].weight.data.copy_(Tensor(range(30)).reshape(6, 5) / mean(range(30)))

    mp.spawn(_test_basic_func, args=(world_size, temp_file_name, test_case, oss, model), nprocs=world_size, join=True)

Changes to test_single_node_adascale.py:

@@ -9,7 +9,6 @@
 """ Test AdaScale with a single node (1 CPU or 1 GPU). """
 
-import gc
 import tempfile
 
 import numpy as np
@@ -21,7 +20,9 @@ from torch.optim import SGD
 from torch.optim.lr_scheduler import LambdaLR
 
 from fairscale.optim import AdaScale
+from fairscale.utils.golden_testing_data import adascale_test_data
 from fairscale.utils.testing import skip_if_no_cuda
+from fairscale.utils.testing_memory import find_tensor_by_shape
 
 
 def test_basic_cpu():
@@ -58,24 +59,8 @@ def test_loss_accum_cpu():
     # We don't call optim.step(), since it will detect that backward is not yet done.
 
 
-# IMPORTANT: make sure these test_cases values are sync'ed with the DDP
-# test in test_ddp_adascale.py. This way, we make sure gradient accumulation
-# works exactly like that in DDP.
 @pytest.mark.parametrize("cpu", [True, False])
-@pytest.mark.parametrize(
-    "test_case",
-    [
-        # "input" value is a list of input tensors for micro-batch 0 and micro-batch 1.
-        {"input": [[1.0, 0], [0, 1.0]], "expected_gain": 2.0},
-        {"input": [[1.0, 1.0], [1.0, 1.0]], "expected_gain": 1.0000001249999846},
-        {"input": [[-1.0, 1.0], [1.0, -1.0]], "expected_gain": 2.0},
-        {"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.5022222222222221},
-        {"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.9433267229211089},
-        # "inputs" to trigger multiple iteration tests, which make sure the
-        # smoothing factor calculation is also covered.
-        {"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.744159431359284},
-    ],
-)
+@pytest.mark.parametrize("test_case", adascale_test_data)
 def test_grad_accum(test_case, cpu):
     """Test the basic functionality on CPU/GPU with gradient accumulation without DDP"""
     model = Linear(2, 2, bias=False)
@@ -381,28 +366,12 @@ def test_unhook():
     model = Linear(123, 456, bias=False).cuda()  # unique shape so that it can be found
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
 
-    def find_tensor():
-        """ Find the weight tensor from the heap
-
-            Return True if found.
-        """
-        for obj in gc.get_objects():
-            try:
-                # Only need to check parameter type objects
-                if "torch.nn.parameter.Parameter" not in str(type(obj)):
-                    continue
-                if torch.is_tensor(obj) or (hasattr(obj, "data") and torch.is_tensor(obj.data)):
-                    if obj.shape == (456, 123):
-                        return True
-            except Exception as e:
-                pass
-        return False
-
     torch.cuda.empty_cache()
-    assert find_tensor(), "something wrong with gc-based method to find the tensor"
+    target_shape = (456, 123)
+    assert find_tensor_by_shape(target_shape), "something wrong with gc-based method to find the tensor"
 
     optim.unhook()
     del model
     del optim
     torch.cuda.empty_cache()
-    assert not find_tensor(), "tensor should have been released"
+    assert not find_tensor_by_shape(target_shape), "tensor should have been released"