Unverified commit fa11d338, authored by Min Xu and committed by GitHub

[test]: test adascale with oss (#328)

* [test]: test adascale with oss

* minor fix

* add a small comment

* refactor: moved find_tensor_by_shape

* refactor: move test golden data into its own module

* refactor: simplified the train function

* refactor: added comments as suggested
parent 2eb1b8ec

New file: fairscale/utils/golden_testing_data.py

# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
""" Golden data used in unit tests. """
adascale_test_data = [
    # "input" value is a list of input tensors for micro-batch/rank 0 and micro-batch/rank 1.
    {"input": [[1.0, 0], [0, 1.0]], "expected_gain": 2.0},
    {"input": [[1.0, 1.0], [1.0, 1.0]], "expected_gain": 1.0000001249999846},
    {"input": [[-1.0, 1.0], [1.0, -1.0]], "expected_gain": 2.0},
    {"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.5022222222222221},
    {"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.9433267229211089},
    # "inputs" to trigger multiple iteration tests, which make sure the
    # smoothing factor calculation is also covered.
    {"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.744159431359284},
]
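
For reference, the "expected_gain" values above can be sanity-checked by hand. The sketch below is editorial (it is not part of the commit) and assumes the textbook AdaScale gain estimator over S replicas: with unbiased estimates of the per-replica gradient variance sigma^2 and the squared norm of the true gradient mu^2, the gain is (sigma^2 + mu^2) / (sigma^2 / S + mu^2). For Linear(2, 2, bias=False) with loss = out.sum(), the weight gradient for input [x1, x2] is [[x1, x2], [x1, x2]]. Two cases are skipped: the identical-inputs case evaluates to exactly 1.0 under this formula (the golden 1.0000001249999846 looks like the effect of a small epsilon term in the library's estimator), and the case whose gradients average to zero relies on the implementation's clamping.

# Editorial sketch (not part of this commit): reproduce two of the golden gains
# above, assuming the textbook AdaScale gain estimator described in the note.
import numpy as np


def textbook_gain(inputs_per_rank):
    # For Linear(2, 2, bias=False) with loss = out.sum(), the weight gradient
    # for input [x1, x2] is [[x1, x2], [x1, x2]].
    grads = [np.tile(np.asarray(x, dtype=np.float64), (2, 1)) for x in inputs_per_rank]
    num_replicas = len(grads)
    g_avg = sum(grads) / num_replicas
    local_sqr = sum((g ** 2).sum() for g in grads) / num_replicas  # estimate of E[|g_i|^2]
    total_sqr = (g_avg ** 2).sum()                                 # |g_avg|^2
    var = num_replicas / (num_replicas - 1) * (local_sqr - total_sqr)  # unbiased sigma^2
    sqr = total_sqr - var / num_replicas                               # unbiased mu^2
    return (var + sqr) / (var / num_replicas + sqr)


print(textbook_gain([[1.0, 0], [0, 1.0]]))      # -> 2.0
print(textbook_gain([[1.0, 4.0], [5.0, 0.5]]))  # -> ~1.5022222222222221, matching the golden value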

New file: fairscale/utils/testing_memory.py

# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
""" Shared functions related to testing GPU memory sizes. """
import gc
from typing import Tuple
import torch

def find_tensor_by_shape(target_shape: Tuple, only_param: bool = True) -> bool:
    """ Find a tensor from the heap

        Args:
            target_shape (tuple):
                Tensor shape to locate.
            only_param (bool):
                Only match Parameter type (e.g. for weights).

        Returns:
            (bool):
                Return True if found.
    """
    for obj in gc.get_objects():
        try:
            # Only need to check parameter type objects if asked.
            if only_param and "torch.nn.parameter.Parameter" not in str(type(obj)):
                continue
            if torch.is_tensor(obj) or (hasattr(obj, "data") and torch.is_tensor(obj.data)):
                if obj.shape == target_shape:
                    return True
        except Exception as e:
            pass
    return False
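
A brief editorial note (not part of the commit): this helper is meant for heap-based leak checks such as the updated test_unhook further below; a layer is given a weight with a shape unlikely to occur elsewhere, and the test asserts that the parameter can, or can no longer, be found via the garbage collector. A minimal sketch of that pattern:

# Editorial sketch of the intended usage, mirroring the test_unhook change further below.
import gc

from torch.nn import Linear

from fairscale.utils.testing_memory import find_tensor_by_shape

model = Linear(123, 456, bias=False)  # (456, 123) is assumed to be a unique weight shape on the heap
assert find_tensor_by_shape((456, 123)), "the weight should be reachable while the model is alive"

del model  # drop the only reference to the module and its Parameter
gc.collect()
assert not find_tensor_by_shape((456, 123)), "the weight should be gone once nothing references it"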

Changes to test_ddp_adascale.py:

@@ -22,6 +22,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.optim import SGD
 
 from fairscale.optim import AdaScale
+from fairscale.utils.golden_testing_data import adascale_test_data
 from fairscale.utils.testing import skip_if_single_gpu
@@ -60,24 +61,8 @@ def _test_basic_func(rank, world_size, tempfile_name, test_case):
     dist.destroy_process_group()
 
 
-# IMPORTANT: make sure these test_cases values are sync'ed with the non-DDP
-# test in test_single_node_adascale.py. This way, we make sure gradient accumulation
-# works exactly like that in DDP.
 @skip_if_single_gpu
-@pytest.mark.parametrize(
-    "test_case",
-    [
-        # "input" value is a list of input tensors for rank 0 and rank 1.
-        {"input": [[1.0, 0], [0, 1.0]], "expected_gain": 2.0},
-        {"input": [[1.0, 1.0], [1.0, 1.0]], "expected_gain": 1.0000001249999846},
-        {"input": [[-1.0, 1.0], [1.0, -1.0]], "expected_gain": 2.0},
-        {"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.5022222222222221},
-        {"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.9433267229211089},
-        # "inputs" to trigger multiple iteration tests, which make sure the
-        # smoothing factor calculation is also covered.
-        {"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.744159431359284},
-    ],
-)
+@pytest.mark.parametrize("test_case", adascale_test_data)
 def test_basic(test_case):
     """Test adascale with DDP without gradient accumulation"""
     world_size = 2

New test file (AdaScale with OSS):

# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
# pylint: disable=missing-module-docstring
# pylint: disable=missing-class-docstring
# pylint: disable=missing-function-docstring
""" Test AdaScale with OSS. """
from statistics import mean
import tempfile
import numpy as np
import pytest
import torch
from torch import Tensor
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn import Linear, Sequential
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import SGD
from fairscale.optim import OSS, AdaScale
from fairscale.utils.golden_testing_data import adascale_test_data
from fairscale.utils.testing import skip_if_single_gpu

def _dist_init(rank, world_size, tempfile_name, backend):
    url = "file://" + tempfile_name
    dist.init_process_group(init_method=url, backend=backend, rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)


def _test_basic_func(rank, world_size, tempfile_name, test_case, oss, model=None):
    _dist_init(rank, world_size, tempfile_name, backend="nccl")

    if model is None:
        model = Linear(2, 2, bias=False)
    model.to("cuda")
    model = DDP(model, device_ids=[rank])

    if oss:
        # For now, we can only wrap AdaScale over OSS. If we do it the other way around,
        # AdaScale needs to take different parameter types, i.e. the parameter list, etc.
        optim = AdaScale(OSS(model.parameters(), SGD, lr=0.1))
    else:
        optim = AdaScale(SGD(model.parameters(), lr=0.1))

    if "input" in test_case:
        inputs = [test_case["input"]]
    else:
        inputs = test_case["inputs"]

    for in_data in inputs:
        in_data = Tensor(in_data[rank]).cuda()
        out = model(in_data)
        out.sum().backward()
        optim.step()
        optim.zero_grad()

    assert np.allclose(optim.gain(), test_case["expected_gain"]), optim.gain()

    if "expected_mean_weight" in test_case:
        mean_weight = mean([model.module[i].weight.data.mean().item() for i in range(4)])
        assert np.allclose(mean_weight, test_case["expected_mean_weight"]), mean_weight

    dist.destroy_process_group()

@skip_if_single_gpu
@pytest.mark.parametrize("test_case", adascale_test_data)
def test_basic(test_case):
    """Test adascale with DDP + OSS with trivial model"""
    world_size = 2
    temp_file_name = tempfile.mkstemp()[1]

    mp.spawn(_test_basic_func, args=(world_size, temp_file_name, test_case, True), nprocs=world_size, join=True)

@skip_if_single_gpu
@pytest.mark.parametrize("oss", [True, False])
def test_sequential(oss):
    """Test adascale with DDP + OSS with a sequential model"""
    world_size = 2
    temp_file_name = tempfile.mkstemp()[1]

    # Run multiple iterations, check the gain for both oss and non-oss cases.
    #
    # The inputs are picked arbitrarily. I used vectors that are orthogonal.
    #
    # The gain and mean_weight values are recorded from my testing and used here
    # to ensure their value is unchanged from commit to commit unless we can
    # explain why.
    test_case = {
        "inputs": [[[1.0, 0], [0, 1.0]], [[0, 1.0], [1.0, 0]]],
        "expected_gain": 1.0335265132125744,
        "expected_mean_weight": 52.92657661437988,
    }

    # The model.
    model = Sequential(
        Linear(2, 3, bias=False), Linear(3, 4, bias=False), Linear(4, 5, bias=False), Linear(5, 6, bias=False)
    )

    # Weights need to be fixed for deterministic gain.
    model[0].weight.data.copy_(Tensor(range(6)).reshape(3, 2) / mean(range(6)))
    model[1].weight.data.copy_(Tensor(range(12)).reshape(4, 3) / mean(range(12)))
    model[2].weight.data.copy_(Tensor(range(20)).reshape(5, 4) / mean(range(20)))
    model[3].weight.data.copy_(Tensor(range(30)).reshape(6, 5) / mean(range(30)))

    mp.spawn(_test_basic_func, args=(world_size, temp_file_name, test_case, oss, model), nprocs=world_size, join=True)

Changes to test_single_node_adascale.py:

@@ -9,7 +9,6 @@
 """ Test AdaScale with a single node (1 CPU or 1 GPU). """
 
-import gc
 import tempfile
 
 import numpy as np
@@ -21,7 +20,9 @@ from torch.optim import SGD
 from torch.optim.lr_scheduler import LambdaLR
 
 from fairscale.optim import AdaScale
+from fairscale.utils.golden_testing_data import adascale_test_data
 from fairscale.utils.testing import skip_if_no_cuda
+from fairscale.utils.testing_memory import find_tensor_by_shape
 
 
 def test_basic_cpu():
@@ -58,24 +59,8 @@ def test_loss_accum_cpu():
     # We don't call optim.step(), since it will detect that backward is not yet done.
 
 
-# IMPORTANT: make sure these test_cases values are sync'ed with the DDP
-# test in test_ddp_adascale.py. This way, we make sure gradient accumulation
-# works exactly like that in DDP.
 @pytest.mark.parametrize("cpu", [True, False])
-@pytest.mark.parametrize(
-    "test_case",
-    [
-        # "input" value is a list of input tensors for micro-batch 0 and micro-batch 1.
-        {"input": [[1.0, 0], [0, 1.0]], "expected_gain": 2.0},
-        {"input": [[1.0, 1.0], [1.0, 1.0]], "expected_gain": 1.0000001249999846},
-        {"input": [[-1.0, 1.0], [1.0, -1.0]], "expected_gain": 2.0},
-        {"input": [[1.0, 4.0], [5.0, 0.5]], "expected_gain": 1.5022222222222221},
-        {"input": [[-0.2, 3.0], [5.0, 0.5]], "expected_gain": 1.9433267229211089},
-        # "inputs" to trigger multiple iteration tests, which make sure the
-        # smoothing factor calculation is also covered.
-        {"inputs": [[[-0.2, 3.3], [5.2, 0.7]], [[1.0, 4.0], [3.1, 0.1]]], "expected_gain": 1.744159431359284},
-    ],
-)
+@pytest.mark.parametrize("test_case", adascale_test_data)
 def test_grad_accum(test_case, cpu):
     """Test the basic functionality on CPU/GPU with gradient accumulation without DDP"""
     model = Linear(2, 2, bias=False)
@@ -381,28 +366,12 @@ def test_unhook():
     model = Linear(123, 456, bias=False).cuda()  # unique shape so that it can be found
    optim = AdaScale(SGD(model.parameters(), lr=0.1), num_gradients_to_accumulate=2)
 
-    def find_tensor():
-        """ Find the weight tensor from the heap
-
-            Return True if found.
-        """
-        for obj in gc.get_objects():
-            try:
-                # Only need to check parameter type objects
-                if "torch.nn.parameter.Parameter" not in str(type(obj)):
-                    continue
-                if torch.is_tensor(obj) or (hasattr(obj, "data") and torch.is_tensor(obj.data)):
-                    if obj.shape == (456, 123):
-                        return True
-            except Exception as e:
-                pass
-        return False
-
     torch.cuda.empty_cache()
-    assert find_tensor(), "something wrong with gc-based method to find the tensor"
+    target_shape = (456, 123)
+    assert find_tensor_by_shape(target_shape), "something wrong with gc-based method to find the tensor"
 
     optim.unhook()
     del model
     del optim
     torch.cuda.empty_cache()
-    assert not find_tensor(), "tensor should have been released"
+    assert not find_tensor_by_shape(target_shape), "tensor should have been released"