Unverified commit 96850dfa authored by Jithun Nair, committed by GitHub

Merge pull request #80 from ROCmSoftwarePlatform/IFU-master-2022-07-29

IFU-master-2022-07-29
parents 87fc4125 cc5f83b5
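# Standalone comparison test: it builds two identical small models, trains one
# with apex.optimizers.FusedAdam under amp O2 and the other with apex.contrib's
# DistributedFusedAdam, and checks that inputs, outputs, gradients, and updated
# weights agree within the given tolerances.
# A typical launch (hypothetical script name, assuming one node with N GPUs):
#   python -m torch.distributed.launch --nproc_per_node=N test_dist_adam.py --steps 20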
import argparse
import random
import sys
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from apex import amp
from apex.optimizers import FusedAdam
from apex.contrib.optimizers.distributed_fused_adam import DistributedFusedAdam
class TestModel(torch.nn.Module):
def __init__(self, args):
super(TestModel, self).__init__()
self.linear = torch.nn.Sequential(*[torch.nn.Linear(args.dim, args.dim, bias=args.bias) for _ in range(args.layers)])
def forward(self, x):
return self.linear(x)
def setup(args):
## Model
ref_model = TestModel(args).cuda()
dist_model = TestModel(args).cuda()
# Same weights
with torch.no_grad():
for dp, rp in zip(dist_model.parameters(), ref_model.parameters()):
dp.data.copy_(rp.data)
dist_model = dist_model.half()
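# Note: the distributed model runs in pure fp16, while the reference model is
# wrapped with amp O2 below and keeps fp32 master weights; the atol/rtol
# command-line tolerances absorb the resulting rounding differences.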
## Optimizer
# same hyperparameters
ref_opt_args = { 'lr': 1e-3, 'eps': 1e-6, 'weight_decay': 0.01 }
ref_opt = FusedAdam(ref_model.parameters(), **ref_opt_args)
dist_opt_args = ref_opt_args.copy()
dist_opt_args.update( {'overlap_reductions' : False} )
dist_opt_args.update( {'process_group_size' : args.n_gpu} )
dist_opt_args.update( {'dwu_group_size' : args.dwu_group_size} )
dist_opt_args.update( {'dwu_num_blocks' : 1} )
dist_opt_args.update( {'dwu_num_chunks' : 1} )
dist_opt = DistributedFusedAdam(dist_model.parameters(), **dist_opt_args)
dist_opt.set_global_scale(1.)
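# A fixed global scale of 1.0 means the distributed optimizer unscales
# gradients by 1, i.e. effectively no loss scaling on this code path.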
## amp-init
amp_args = { 'loss_scale' : 'dynamic' , 'opt_level' : 'O2'}
ref_model, ref_opt = amp.initialize(ref_model, ref_opt, **amp_args)
## DDP
ref_model = DDP(ref_model, device_ids=[args.rank])
with torch.no_grad():
for dp in dist_model.parameters():
torch.distributed.broadcast(dp.data, src=0)
for rp in ref_model.parameters():
torch.distributed.broadcast(rp.data, src=0)
torch.cuda.synchronize()
torch.distributed.barrier()
if get_rank() == 0:
print(f'dist opt with {args.n_gpu} GPUs')
return ref_model, ref_opt, dist_model, dist_opt
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=-1)
parser.add_argument('--steps', type=int, default=20)
parser.add_argument('--batch', type=int, default=32)
parser.add_argument('--dim', type=int, default=4)
parser.add_argument('--layers', type=int, default=2)
parser.add_argument('--bias', action='store_true')
parser.add_argument('--atol', type=float, default=1e-3)
parser.add_argument('--rtol', type=float, default=1)
parser.add_argument('--dwu_group_size', type=int, default=1)
args = parser.parse_args()
return args
def setup_env(args):
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')
args.rank = torch.distributed.get_rank()
args.n_gpu = torch.distributed.get_world_size()
seed = 42 + get_rank()
random.seed(seed)
torch.manual_seed(seed)
return args
def get_rank():
return torch.distributed.get_rank()
def main():
args = parse_args()
args = setup_env(args)
tol_args = { 'atol' : args.atol, 'rtol' : args.rtol }
torch.set_printoptions(precision=16)
ref_model, ref_opt, dist_model, dist_opt = setup(args)
# lazy_init not called yet, initialize stash
stash = ref_opt._amp_stash
stash.all_fp16_params, stash.all_fp32_from_fp16_params = [], []
# make sure everything from _first_step_init_ is ready before training
# e.g. registering allreduce_hook
# so that gradients are copied/reduced when necessary
dist_opt._init_everything()
for i in range(args.steps):
x_ref = torch.randn(args.batch, args.dim, dtype=torch.half).cuda().requires_grad_(True)
x_dist = x_ref.clone().detach().requires_grad_(True)
if get_rank() == 0:
print(f'[{i}] Checking input')
#print("x_ref:", x_ref.flatten()[:10])
#print("x_dist:", x_dist.flatten()[:10])
assert(torch.allclose(x_ref, x_dist, **tol_args))
y_ref = ref_model(x_ref).half()
y_dist = dist_model(x_dist)
if get_rank() == 0:
print(f'[{i}] Checking output')
#print("y_ref:", y_ref.flatten()[:10])
#print("y_dist:", y_dist.flatten()[:10])
assert(torch.allclose(y_ref, y_dist, **tol_args))
dy = torch.randn_like(y_ref)
y_ref.backward(dy)
y_dist.backward(dy)
if get_rank() == 0:
print(f'[{i}] Checking gradients')
torch.distributed.barrier()
torch.cuda.synchronize()
assert(torch.allclose(x_ref.grad, x_dist.grad, **tol_args))
# gradient all-reduce within distributed optimizer
dist_opt.complete_reductions()
if get_rank() == 0:
print(f'[{i}] Stepping')
ref_opt.step()
dist_opt.step()
torch.cuda.synchronize()
torch.distributed.barrier()
print('Checking new weights')
if get_rank() == 0:
print("ref param:", ref_model.module.linear[0].weight)
print("dist param:", dist_model.linear[0].weight)
for i, (rp, dp) in enumerate(zip(ref_model.parameters(), dist_model.parameters())):
if not torch.allclose(rp, dp, **tol_args):
if get_rank() == 0:
print(f'Rank: {get_rank()}, Param: {i}')
print(f'ref: {rp.sum().item()}, dist: {dp.sum().item()}')
print(rp)
print(dp)
print(torch.abs(rp-dp) > tol_args['atol'])
sys.exit(0)
# zero grads
for rp, dp in zip(ref_model.parameters(), dist_model.parameters()):
rp.grad = None
dp.grad = None
if __name__ == "__main__":
main()
from itertools import product
import os
import random
import unittest
import math

import torch
import apex
from torch.optim import Optimizer


class TestFusedOptimizer(unittest.TestCase):
    def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
        self.max_abs_diff = max_abs_diff
        self.max_rel_diff = max_rel_diff
        self.iters = iters
        torch.manual_seed(9876)

    def tearDown(self):
        pass

    def gen_param_optim(self, tensors, options, tst_options=None):
        # Adding this to make backward compatible with existing tests. Just in
        # case "tst_options" are not provided, it gets a copy of options
        # which contains the parameters for the reference optimizer
        if tst_options == None:
            tst_options = options

        ref_param = []
        tst_param = []
        for tensor in tensors:
...@@ -60,11 +59,11 @@ class TestFusedOptimizer(unittest.TestCase):
        return max_abs_diff, max_rel_diff

    def gen_single_type_test(self, param_type=torch.float, device='cuda', *, skip_assert: bool = False):
        nelem = 278011

        # Some ref and test optimizers may require different set of options.
        # This is a quick workaround to add that functionality while making
        # minimum changes in existing code.
        # If there is no "tst_options" field provided, safe to initialize
        # the test optimizer with the parameters of reference optimizer.
...@@ -80,6 +79,8 @@ class TestFusedOptimizer(unittest.TestCase):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            if skip_assert:
                return
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
...@@ -87,8 +88,8 @@ class TestFusedOptimizer(unittest.TestCase):

class TestFusedAdam(TestFusedOptimizer):
    def setUp(self):
        super().setUp()
        self.options = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
                        'weight_decay': 0, 'amsgrad': False}
        self.ref_optim = torch.optim.Adam
...@@ -98,9 +99,15 @@ class TestFusedAdam(TestFusedOptimizer):
    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    # NOTE(mkozuki): Current threshold values look too small for BFloat16.
    # TODO(mkozuki): Refactor `TestFusedOptimizer`
    @unittest.skip("NaN issue observed on ROCm as of 12/1/2021. The failing unit test is introduced by a PyTorch commit sometime in between rocm/pytorch:rocm4.3.1_ubuntu18.04_py3.6_pytorch_1.9.0 and 2021/12/01. Please refer to https://github.com/ROCmSoftwarePlatform/apex/issues/63")
    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16, skip_assert=True)

    @unittest.skip("Skipped the test since a regression introduced from PyTorch upstream: due to https://github.com/pytorch/pytorch/issues/80809#issuecomment-1175211598. Please also refer to https://github.com/ROCmSoftwarePlatform/apex/issues/82")
    def test_bfloat16(self):
        self.gen_single_type_test(param_type=torch.bfloat16, skip_assert=True)

    @unittest.skip("Skipped the test since a regression introduced from PyTorch upstream: due to https://github.com/pytorch/pytorch/issues/80809#issuecomment-1175211598. Please also refer to https://github.com/ROCmSoftwarePlatform/apex/issues/82")
    @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
...
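# The next block is a small unittest for apex.pyprof's data post-processing:
# two recorded kernel dictionaries are fed through prof.data.Data and
# prof.prof.foo to make sure the parsing path runs without error.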
import inspect
import unittest
from apex.pyprof.prof.data import Data
from apex.pyprof.prof.prof import foo
class TestPyProfData(unittest.TestCase):
def __init__(self, testName):
super().__init__(testName)
def setUp(self):
pass
def tearDown(self):
pass
def test_data(self):
kernels = [
{'kShortName': 'elementwise_kernel', 'kDuration': 2848, 'layer': [], 'trace': [], 'reprMarkers': [], 'marker': ["{'mod': 'Tensor', 'op': 'float', 'args': [{'name': '', 'type': 'tensor', 'shape': (18, 104, 160), 'dtype': 'bool'}]}"], 'seqMarker': ['to, seq = 60471'], 'seqId': [60471], 'subSeqId': 0, 'altSeqId': [], 'dir': 'fprop', 'mod': ['Tensor'], 'op': ['float'], 'tid': 1431533376, 'device': 0, 'stream': 7, 'grid': (585, 1, 1), 'block': (512, 1, 1), 'kLongName': 'void at::native::elementwise_kernel<512, 1, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1} const&)::{lambda(int)#1}>(int, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1} const&)::{lambda(int)#1})'},
{'kShortName': 'elementwise_kernel', 'kDuration': 201182, 'layer': [], 'trace': [], 'reprMarkers': [], 'marker': ["{'mod': 'Tensor', 'op': 'clone', 'args': [{'name': '', 'type': 'tensor', 'shape': (18, 4, 416, 640), 'dtype': 'float32'}]}"], 'seqMarker': ['clone, seq = 60161'], 'seqId': [60161], 'subSeqId': 0, 'altSeqId': [], 'dir': 'fprop', 'mod': ['Tensor'], 'op': ['clone'], 'tid': 1431533376, 'device': 0, 'stream': 7, 'grid': (37440, 1, 1), 'block': (128, 1, 1), 'kLongName': 'void at::native::elementwise_kernel<128, 4, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1} const&)::{lambda(int)#2}>(int, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1} const&)::{lambda(int)#2})'},
]
for k in kernels:
d = Data(k)
mod = k['mod']
op = k['op']
xx = foo(mod, op, d)
d.setParams(xx.params())
def run_tests(test_name):
dummy = TestPyProfData(test_name)
test_cases = list(filter(lambda x: 'test_' in x, map(lambda x: x[0], inspect.getmembers(dummy, predicate=inspect.ismethod))))
print(f'Running tests for {test_name}')
suite = unittest.TestSuite()
for test_case in test_cases:
suite.addTest(TestPyProfData(test_case))
unittest.TextTestRunner().run(suite)
if __name__ == '__main__':
run_tests('test_data')
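# The following suite exercises apex.pyprof's NVTX annotation across a broad
# set of torch / torch.nn.functional ops; each test only checks that the call
# executes under the instrumentation.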
import test_pyprof_nvtx.TestPyProfNvtx as TestPyProfNvtx
import inspect
import os
import torch
import torch.nn.functional as F
import unittest
from apex import pyprof
pyprof.nvtx.init()
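# pyprof.nvtx.init() monkey-patches torch functions so that subsequent calls
# emit NVTX ranges for the profiler.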
# TODO: add tests for:
# F.bilinear, F.l1_loss, F.multilabel_soft_margin_loss, F.multi_margin_loss
class TestPyProfNvtx(unittest.TestCase):
def __init__(self, testName, dtype=torch.float16):
super().__init__(testName)
self.dtype = dtype
def setUp(self):
pass
def tearDown(self):
pass
def test_conv1d(self):
# Data and weight tensors
tensor1d_in_conv = torch.randn(32, 3, 224, device='cuda', dtype=self.dtype)
tensor1d_in_conv_grouped = torch.randn(32, 6, 224, device='cuda', dtype=self.dtype)
conv1d_filter = torch.randn(16, 3, 3, device='cuda', dtype=self.dtype)
conv1d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
# Vanilla conv1d
conv1d_out_vanilla = F.conv1d(tensor1d_in_conv, conv1d_filter)
# conv1d with bias
conv1d_out_with_bias = F.conv1d(tensor1d_in_conv, conv1d_filter, bias=conv1d_bias)
# conv1d - stride > 1
conv1d_out_strided = F.conv1d(tensor1d_in_conv, conv1d_filter, stride=2)
# conv1d - dilation > 1
conv1d_out_dilated = F.conv1d(tensor1d_in_conv, conv1d_filter, dilation=2)
# conv1d - groups > 1
conv1d_out_grouped = F.conv1d(tensor1d_in_conv_grouped, conv1d_filter, groups=2)
# conv1d - padding with zeros
conv1d_out_padding_zeros = F.conv1d(tensor1d_in_conv, conv1d_filter, padding=6)
def test_conv2d(self):
# Data and weight tensors
tensor2d_in_conv = torch.randn(32, 3, 224, 224, device='cuda', dtype=self.dtype)
tensor2d_in_conv_grouped = torch.randn(32, 6, 224, 224, device='cuda', dtype=self.dtype)
conv2d_filter = torch.randn(16, 3, 3, 3, device='cuda', dtype=self.dtype)
conv2d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
# Vanilla conv2d
conv2d_out_vanilla = F.conv2d(tensor2d_in_conv, conv2d_filter)
# conv2d with bias
conv2d_with_bias = F.conv2d(tensor2d_in_conv, conv2d_filter, bias=conv2d_bias)
# conv2d - stride > 1
conv2d_out_strided = F.conv2d(tensor2d_in_conv, conv2d_filter, stride=2)
# conv2d - dilation > 1
conv2d_out_dilated = F.conv2d(tensor2d_in_conv, conv2d_filter, dilation=2)
# conv2d - groups > 1
conv2d_out_grouped = F.conv2d(tensor2d_in_conv_grouped, conv2d_filter, groups=2)
# conv2d - padding with zeros
conv2d_out_padding_zeros = F.conv2d(tensor2d_in_conv, conv2d_filter, padding=6)
def test_conv3d(self):
# Data and weight tensors
tensor3d_in_conv = torch.randn(32, 3, 16, 224, 224, device='cuda', dtype=self.dtype)
tensor3d_in_conv_grouped = torch.randn(32, 6, 16, 224, 224, device='cuda', dtype=self.dtype)
conv3d_filter = torch.randn(16, 3, 3, 3, 3, device='cuda', dtype=self.dtype)
conv3d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
# Vanilla conv3d
conv3d_out_vanilla = F.conv3d(tensor3d_in_conv, conv3d_filter)
# conv3d - stride > 1
conv3d_out_strided = F.conv3d(tensor3d_in_conv, conv3d_filter, stride=2)
# conv3d - dilation > 1
conv3d_out_dilated = F.conv3d(tensor3d_in_conv, conv3d_filter, dilation=2)
# conv3d - groups > 1
conv3d_out_grouped = F.conv3d(tensor3d_in_conv_grouped, conv3d_filter, groups=2)
# conv3d - padding with zeros
conv3d_out_padding_zeros = F.conv3d(tensor3d_in_conv, conv3d_filter, padding=6)
def test_conv_transpose1d(self):
# Data and weight tensors
conv_transpose1d_tensor = torch.randn(64, 16, 64, device='cuda', dtype=self.dtype)
conv_transpose1d_filter = torch.randn(16, 32, 3, device='cuda', dtype=self.dtype)
conv_transpose1d_bias = torch.randn(32, device='cuda', dtype=self.dtype)
# Conv transpose runs
conv_transpose1d_out = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter)
conv_transpose1d_out_biased = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, bias=conv_transpose1d_bias)
conv_transpose1d_out_strided = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, stride=2)
conv_transpose1d_out_padded = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, padding=3)
conv_transpose1d_out2_padded = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, output_padding=2, dilation=3)
conv_transpose1d_out_grouped = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, groups=2)
conv_transpose1d_out_dilated = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, dilation=2)
def test_conv_transpose2d(self):
# Data and weight tensors
conv_transpose2d_tensor = torch.randn(64, 8, 5, 5, device='cuda', dtype=self.dtype)
conv_transpose2d_filter = torch.randn(8, 16, 3, 3, device='cuda', dtype=self.dtype)
conv_transpose2d_bias = torch.randn(16, device='cuda', dtype=self.dtype)
# Conv transpose runs
conv_transpose2d_out = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter)
conv_transpose2d_out_biased = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, bias=conv_transpose2d_bias)
conv_transpose2d_out_strided = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, stride=2)
conv_transpose2d_out_padded = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, padding=3)
conv_transpose2d_out2_padded = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, output_padding=2, dilation=3)
conv_transpose2d_out_grouped = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, groups=2)
conv_transpose2d_out_dilated = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, dilation=2)
def test_conv_transpose3d(self):
# Data and weight tensors
conv_transpose3d_tensor = torch.randn(20, 16, 50, 10, 20, device='cuda', dtype=self.dtype)
conv_transpose3d_filter = torch.randn(16, 33, 3, 3, 3, device='cuda', dtype=self.dtype)
conv_transpose3d_bias = torch.randn(33, device='cuda', dtype=self.dtype)
# Conv transpose runs
conv_transpose3d_out = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter)
conv_transpose3d_out_biased = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, bias=conv_transpose3d_bias)
conv_transpose3d_out_strided = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, stride=2)
conv_transpose3d_out_padded = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, padding=3)
conv_transpose3d_out2_padded = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, output_padding=2, dilation=3)
conv_transpose3d_out_grouped = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, groups=2)
conv_transpose3d_out_dilated = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, dilation=2)
def test_unfold(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
kernel_size = (4, 5)
inp_unf_dilated = F.unfold(inp, kernel_size, dilation=2)
inp_unf_padded = F.unfold(inp, kernel_size, padding=2)
inp_unf_strided = F.unfold(inp, kernel_size, stride=2)
def test_fold(self):
inp = torch.randn(3, 20, 20, device='cuda', dtype=self.dtype)
inp_folded = F.fold(inp, (4, 5), (1, 1))
def test_avg_pool1d(self):
inp = torch.randn(1, 1, 28, device='cuda', dtype=self.dtype)
out = F.avg_pool1d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
def test_avg_pool2d(self):
inp = torch.randn(1, 3, 224, 224, device='cuda', dtype=self.dtype)
out = F.avg_pool2d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
def test_avg_pool3d(self):
inp = torch.randn(1, 3, 16, 224, 224, device='cuda', dtype=self.dtype)
out = F.avg_pool3d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
def test_adaptive_avg_pool1d(self):
inp = torch.randn(1, 1, 28, device='cuda', dtype=self.dtype)
out = F.adaptive_avg_pool1d(inp, output_size=5)
def test_adaptive_avg_pool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_avg_pool2d(inp, output_size=5)
def test_adaptive_avg_pool3d(self):
inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_avg_pool3d(inp, output_size=5)
def test_max_pool1d(self):
inp = torch.randn(1, 16, 32, device='cuda', dtype=self.dtype)
out = F.max_pool1d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
def test_max_pool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.max_pool2d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
def test_max_pool3d(self):
inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.max_pool3d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
def test_adaptive_max_pool1d(self):
inp = torch.randn(1, 16, 28, device='cuda', dtype=self.dtype)
out = F.adaptive_max_pool1d(inp, output_size=5, return_indices=True)
def test_adaptive_max_pool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_max_pool2d(inp, output_size=5, return_indices=True)
def test_adaptive_max_pool3d(self):
inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_max_pool3d(inp, output_size=5, return_indices=True)
def test_max_unpool1d(self):
inp = torch.randn(1, 16, 32, device='cuda', dtype=self.dtype)
output, indices = F.max_pool1d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
output = F.max_unpool1d(output, indices, kernel_size=2, stride=2, padding=2)
def test_max_unpool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
output, indices = F.max_pool2d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
output = F.max_unpool2d(output, indices, kernel_size=2, stride=2, padding=2)
def test_max_unpool3d(self):
inp = torch.randn(1, 16, 8, 32, 32, device='cuda', dtype=self.dtype)
output, indices = F.max_pool3d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
output = F.max_unpool3d(output, indices, kernel_size=2, stride=2, padding=2)
def test_lp_pool1d(self):
inp = torch.randn(1, 32, 64, device='cuda', dtype=self.dtype)
output = F.lp_pool1d(inp, 2, 3, stride=2, ceil_mode=True)
def test_lp_pool2d(self):
#torch.nn.LPPool2d(norm_type, kernel_size, stride=None, ceil_mode=False)
inp = torch.randn(1, 32, 64, 64, device='cuda', dtype=self.dtype)
output = F.lp_pool2d(inp, 2, 3, stride=2, ceil_mode=True)
def test_threshold(self):
inp = torch.randn(1, 8, 32, 32, device='cuda', dtype=self.dtype)
output = F.threshold(inp, 6, 6, inplace=False)
def test_threshold_(self):
inp = torch.randn(1, 8, 32, 32, device='cuda', dtype=self.dtype)
output = F.threshold_(inp, 6, 6)
def test_relu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.relu(inp, inplace=False)
def test_relu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.relu_(inp)
def test_hardtanh(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.hardtanh(inp, min_val=-1., max_val=1., inplace=False)
def test_hardtanh_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.hardtanh_(inp, min_val=-1., max_val=1.)
def test_relu6(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.relu6(inp, inplace=False)
def test_elu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.elu(inp, alpha=1.0, inplace=False)
def test_elu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.elu_(inp, alpha=1.0)
def test_selu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.selu(inp)
def test_celu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.celu(inp, alpha=1.0, inplace=False)
def test_leaky_relu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.leaky_relu(inp, negative_slope=0.01, inplace=False)
def test_leaky_relu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.leaky_relu_(inp, negative_slope=0.01)
def test_prelu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
weight = torch.randn(1, device='cuda', dtype=self.dtype)
output = F.prelu(inp, weight)
def test_rrelu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.rrelu(inp, lower=1./8, upper=1./3, training=False, inplace=False)
def test_rrelu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.rrelu(inp, lower=1./8, upper=1./3, training=False)
def test_glu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.glu(inp, dim=-1)
def test_logsigmoid(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.logsigmoid(inp)
def test_hardshrink(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.hardshrink(inp, lambd=0.5)
def test_tanhshrink(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.tanhshrink(inp)
def test_softsign(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.softsign(inp)
def test_softplus(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.softplus(inp, beta=1, threshold=20)
def test_softmin(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.softmin(inp, dim=1, _stacklevel=3, dtype=self.dtype)
def test_softmax(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.softmax(inp, dim=1, _stacklevel=3, dtype=self.dtype)
def test_softshrink(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.softshrink(inp, lambd=0.5)
def test_gumbel_softmax(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.gumbel_softmax(inp, tau=1, hard=False, eps=1e-10, dim=-1)
def test_log_softmax(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.log_softmax(inp, dim=-1, _stacklevel=3)
def test_tanh(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = torch.tanh(inp)
def test_sigmoid(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = torch.sigmoid(inp)
def test_batch_norm(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
# running_mean, running_var
running_mean = torch.randn(3, device='cuda', dtype=self.dtype)
running_var = torch.randn(3, device='cuda', dtype=self.dtype)
output = F.batch_norm(inp, running_mean, running_var, weight=None, bias=None, training=False, momentum=0.1, eps=1e-05)
def test_instance_norm(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
running_mean = torch.randn(3, device='cuda', dtype=self.dtype)
running_var = torch.randn(3, device='cuda', dtype=self.dtype)
output = F.instance_norm(inp, running_mean=running_mean, running_var=running_var, weight=None, bias=None, use_input_stats=True, momentum=0.1, eps=1e-05)
def test_layer_norm(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.layer_norm(inp, inp.size()[1:], weight=None, bias=None, eps=1e-05)
def test_local_response_norm(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.local_response_norm(inp, 2, alpha=0.0001, beta=0.75, k=1.0)
def test_normalize(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.normalize(inp, p=2, dim=1, eps=1e-12, out=None)
def test_linear(self):
inp = torch.randn(32, 64, 128, device='cuda', dtype=self.dtype)
weight = torch.randn(256, 128, device='cuda', dtype=self.dtype)
output = F.linear(inp, weight, bias=None)
def test_dropout(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.dropout(inp, p=0.5, training=True, inplace=False)
def test_alpha_dropout(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.alpha_dropout(inp, p=0.5, training=True, inplace=False)
def test_dropout2d(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.dropout2d(inp, p=0.5, training=True, inplace=False)
def test_dropout3d(self):
inp = torch.randn(16, 8, 32, 64, 64, device='cuda', dtype=self.dtype)
output = F.dropout3d(inp, p=0.5, training=True, inplace=False)
def test_embedding(self):
pre_embed_dim = 1024
post_embed_dim = 32
inp = torch.randint(0, pre_embed_dim, (128, 16), device='cuda')
weight = torch.randn(pre_embed_dim, post_embed_dim, device='cuda', dtype=self.dtype)
output = F.embedding(inp, weight, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False)
def test_embedding_bag(self):
pre_embed_dim = 1024
post_embed_dim = 32
inp = torch.randint(0, pre_embed_dim, (128, 16), device='cuda')
weight = torch.randn(pre_embed_dim, post_embed_dim, device='cuda', dtype=self.dtype)
output = F.embedding_bag(inp, weight, offsets=None, max_norm=None, norm_type=2,
scale_grad_by_freq=False, mode='mean', sparse=False)
def test_one_hot(self):
num_classes = 10
inp = torch.randint(0, num_classes, (128, 16), device='cuda')
output = F.one_hot(inp, num_classes=10)
def test_pairwise_distance(self):
inp1 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
inp2 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
output = F.pairwise_distance(inp1, inp2, p=2.0, eps=1e-06, keepdim=False)
def test_cosine_similarity(self):
inp1 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
inp2 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
output = F.cosine_similarity(inp1, inp2, dim=1, eps=1e-8)
def test_pdist(self):
# pdist is not implemented for fp16
inp = torch.randn(128, 128, device='cuda', dtype=torch.float32)
output = F.pdist(inp, p=2)
def test_binary_cross_entropy(self):
# binary_cross_entropy is not implemented for fp16
inp = torch.randn(32, 128, device='cuda', dtype=torch.float32, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=torch.float32, requires_grad=False)
output = F.binary_cross_entropy(torch.sigmoid(inp), target)
def test_binary_cross_entropy_with_logits(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.empty_like(inp).random_(2)
output = F.binary_cross_entropy_with_logits(inp, target)
def test_poisson_nll_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.poisson_nll_loss(inp, target, log_input=True, full=False,
size_average=None, eps=1e-08, reduce=None, reduction='mean')
def test_cosine_embedding_loss(self):
inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.cosine_embedding_loss(inp1, inp2, target, margin=0,
size_average=None, reduce=None, reduction='mean')
def test_cross_entropy(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randint(0, 100, (32,), device='cuda', dtype=torch.long, requires_grad=False)
output = F.cross_entropy(inp, target, weight=None, size_average=None,
ignore_index=-100, reduce=None, reduction='mean')
def test_ctc_loss(self):
# force fp32 because _th_normal_ (used by the next line) is not supported for fp16
log_probs = torch.randn(50, 16, 20, device='cuda', dtype=torch.float32).log_softmax(2).detach().requires_grad_()
targets = torch.randint(1, 20, (16, 30), device='cuda', dtype=torch.long)
input_lengths = torch.full((16,), 50, dtype=torch.long)
target_lengths = torch.randint(10, 30, (16,), dtype=torch.long)
loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths)
def test_hinge_embedding_loss(self):
inp = torch.randn(128, 32, device='cuda', dtype=self.dtype)
target = torch.randint(0, 1, (32,), device='cuda') - 1
output = F.hinge_embedding_loss(inp, target, margin=1.0, size_average=None, reduce=None, reduction='mean')
def test_kl_div(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
output = F.kl_div(inp, target, size_average=None, reduce=None, reduction='batchmean')
def test_mse_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
output = F.mse_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_margin_ranking_loss(self):
inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = (torch.randint(0, 1, (128,), device='cuda') - 1).type_as(inp1)
output = F.margin_ranking_loss(inp1, inp2, target, margin=0, size_average=None, reduce=None, reduction='mean')
def test_multilabel_margin_loss(self):
inp = torch.randn(1024, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randint(0, 10, (1024,), dtype=torch.long, device='cuda')
output = F.multilabel_margin_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_nll_loss(self):
inp = torch.randn(64, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randint(0, 10, (64,), device='cuda', dtype=torch.long)
output = F.nll_loss(inp, target, weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='mean')
def test_smooth_l1_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.smooth_l1_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_soft_margin_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.soft_margin_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_triplet_margin_loss(self):
inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp3 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
output = F.triplet_margin_loss(inp1, inp2, inp3, margin=1.0, p=2,
eps=1e-06, swap=False, size_average=None, reduce=None, reduction='mean')
def test_pixel_shuffle(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = torch.nn.functional.pixel_shuffle(inp, 2)
def test_pad(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
pad = (3, 3)
output = F.pad(inp, pad, mode='constant', value=0)
def test_interpolate(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.interpolate(inp, size=None, scale_factor=2, mode='nearest', align_corners=None)
def test_grid_sample(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
grid = torch.randn(16, 32, 32, 2, device='cuda', dtype=self.dtype)
output = F.grid_sample(inp, grid, mode='bilinear', padding_mode='zeros')
def test_affine_grid(self):
theta = torch.randn(32, 2, 3, device='cuda', dtype=self.dtype)
size = (32, 8, 32, 32)
output = F.affine_grid(theta, size)
def run_tests(precision):
dummy = TestPyProfNvtx('test_affine_grid', None)
test_cases = list(filter(lambda x: 'test_' in x, map(lambda x: x[0], inspect.getmembers(dummy, predicate=inspect.ismethod))))
print("Running tests for {}".format(precision))
suite = unittest.TestSuite()
for test_case in test_cases:
suite.addTest(TestPyProfNvtx(test_case, precision))
unittest.TextTestRunner().run(suite)
if __name__ == '__main__':
run_tests(torch.float32)
run_tests(torch.float16)
#!/bin/bash
APEX_TEST_WITH_ROCM=1 APEX_SKIP_FLAKY_TEST=1 python run_test.py
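# run_test.py (below) discovers and runs the selected L0 unittest directories;
# the shell one-liner above is the ROCm entry point that invokes it with the
# APEX_TEST_WITH_ROCM / APEX_SKIP_FLAKY_TEST flags set.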
"""L0 Tests Runner.
How to run this script?
1. Run all the tests: `python /path/to/apex/tests/L0/run_test.py`
2. Run one of the tests (e.g. fused layer norm):
`python /path/to/apex/tests/L0/run_test.py --include run_fused_layer_norm`
3. Run two or more of the tests (e.g. optimizers and fused layer norm):
`python /path/to/apex/tests/L0/run_test.py --include run_optimizers run_fused_layer_norm`
"""
import argparse
import os
import unittest
import sys

from apex.testing.common_utils import TEST_WITH_ROCM
from apex.testing.common_utils import SKIP_FLAKY_TEST

TEST_ROOT = os.path.dirname(os.path.abspath(__file__))
TEST_DIRS = [
    "run_amp",
    "run_fp16util",
    "run_optimizers",
    "run_fused_layer_norm",
    "run_mlp",
    "run_transformer",  # not fully supported on ROCm
]
DEFAULT_TEST_DIRS = [
    "run_amp",
    "run_fp16util",
    "run_optimizers",
    "run_fused_layer_norm",
    "run_mlp",
]


def parse_args():
    parser = argparse.ArgumentParser(
        description="L0 test runner",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--include",
        nargs="+",
        choices=TEST_DIRS,
        default=DEFAULT_TEST_DIRS,
        help="select a set of tests to run (defaults to ALL tests).",
    )
    args, _ = parser.parse_known_args()
    return args


def main(args):
    runner = unittest.TextTestRunner(verbosity=2)
    errcode = 0
    for test_dir in args.include:
        test_dir = os.path.join(TEST_ROOT, test_dir)
        print(test_dir)
        suite = unittest.TestLoader().discover(test_dir)
        print("\nExecuting tests from " + test_dir)
        result = runner.run(suite)
        if not result.wasSuccessful():
            errcode = 1
    sys.exit(errcode)


if __name__ == '__main__':
    args = parse_args()
    main(args)
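# The next script sweeps GPT-2 configurations of increasing depth under several
# data/tensor/pipeline-parallel splits, with and without CPU offload, parses
# each child run's stdout for iteration time and parameter count, and plots the
# resulting scaling curves.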
import subprocess
import os
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
def run_gpt(cmd):
args = list(cmd.split(" "))
p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
outs, errs = p.communicate()
outs = list(str((outs).decode("utf-8")).splitlines())
success = False
runtime = 0
num_params = 0
for out in outs:
out = str(out)
if "Average Iteration Time:" in str(out):
slicey = out[out.find(":") + 2 :]
try:
runtime = float(slicey)
except:
print(slicey)
quit()
if "Number of Parameters:" in str(out):
slicey = out[out.find(":") + 2 :]
try:
num_params = int(slicey)
except:
print(slicey)
quit()
if str(out) == str(TEST_SUCCESS_MESSAGE):
success = True
return runtime, round(float(int(num_params)) / 10.0 ** 9, 3), success, errs
def plot(runtimes):
import matplotlib.pyplot as plt
for distributed_setting in runtimes.keys():
plt.scatter(
runtimes[distributed_setting].keys(),
runtimes[distributed_setting].values(),
label=distributed_setting,
)
plt.legend()
plt.xlabel("Parameters (Billions)")
plt.ylabel("Training Iteration time (s)")
plt.title(str("GPT Scaling w/ Offloading"))
plt.savefig("offload_gpt_scaling.png")
plt.close()
if not os.path.exists("/my_workspace/"):
os.system("mkdir /my_workspace/")
os.system("cp *.png /my_workspace/")
def main():
runtimes = {}
nlist = (
list(range(2000, 10000, 2000))
+ list(range(10000, 50000, 5000))
+ list(range(50000, 100000, 10000))
)
print("N-List:", nlist)
for data_parr, tens_parr, pipe_parr in [(8, 1, 1), (4, 2, 1), (2, 1, 4), (1, 2, 4)]:
for offload in [True, False]:
dist_setting = (
"ddp="
+ str(data_parr)
+ ", tensor_parr="
+ str(tens_parr)
+ ", pipe_parr="
+ str(pipe_parr)
+ ", offload="
+ str(offload)
)
runtimes[dist_setting] = {}
print("Beginning Testing for", dist_setting)
for n in nlist:
cmd = "python3 -m torch.distributed.launch --nproc_per_node=8 run_gpt_minimal_test.py"
cmd += (
" --micro-batch-size 1 --num-layers "
+ str(n)
+ " --hidden-size 128 --num-attention-heads 16"
)
cmd += (
" --max-position-embeddings 128 --seq-length 128 --tensor-model-parallel-size "
+ str(tens_parr)
)
cmd += (
" --pipeline-model-parallel-size "
+ str(pipe_parr)
+ (" --cpu-offload" if offload else "")
)
print(cmd)
runtime, bill_params, success, errs = run_gpt(cmd)
if success:
runtimes[dist_setting][bill_params] = runtime
print(
str(runtime) + "s per training iter for",
str(bill_params) + "B parameter GPT-2",
)
if n >= 10000:
plot(runtimes)
else:
print("GPT-2 w/", n, "layers failed using", dist_setting)
print("Moving on to the next distributed setting...")
print("#" * (25))
print()
plot(runtimes)
break
print(runtimes)
plot(runtimes)
if __name__ == "__main__":
main()
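# The following test trains a minimal BERT through apex.transformer's
# pipeline-parallel schedules on synthetic masked-LM data, optionally using
# torch_ucc for point-to-point communication, and asserts that the averaged
# loss eventually drops below a threshold.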
import random
import torch

try:
    import torch_ucc
except ImportError:
    HAS_TORCH_UCC = False
else:
    HAS_TORCH_UCC = True
    print("Use UCC as backend of Pipeline Parallel ProcessGroups")

from apex.transformer.enums import ModelType
from apex.transformer import tensor_parallel
from apex.transformer import parallel_state
from apex.transformer.log_util import set_logging_level
from apex.transformer.tensor_parallel import vocab_parallel_cross_entropy
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import unwrap_model
from apex.transformer.pipeline_parallel.utils import (
    average_losses_across_data_parallel_group,
)
from apex.transformer.pipeline_parallel.schedules import get_forward_backward_func
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.common import (
    _get_params_for_weight_decay_optimization,
)
from apex.transformer.testing.standalone_bert import bert_model_provider
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator
import warnings


class DebugWarning(Warning):
    pass


set_logging_level("WARNING")

mode = None
MANUAL_SEED = 42
inds = None
...@@ -26,92 +48,130 @@ EASY_MODE = False
EASY_MODE_SIZ = 32
ONCE = False


# download a public domain book as corpus
def download_fancy_data():
    # import requests
    # response = requests.get('https://internet.com/book.txt')
    # text = ' '.join(response.text.split())
    text = """
  An original sentence not subject to any license restrictions, copyright, or royalty payments. Nothing to see here. Commercial or non-commercial use. Research or non-research purposes. The quick brown fox jumps over the lazy dog. Lorem ipsum.
  """
    text = text * 1024
    encoded = text.encode("ascii", "replace")
    ints = [int(encoded[i]) for i in range(len(encoded))]
    return torch.tensor(ints)


# build a batch given sequence_len and batch size
def generate_fancy_data_labels(sequence_len, batch_size):
    global data_idx
    global inds
    global masks
    global MANUAL_SEED
    temps = []
    for i in range(batch_size):
        if inds is None or data_idx >= len(inds):
            # hack as use of RNG will fall out of sync due to pipelines being different
            torch.manual_seed(MANUAL_SEED)
            inds = torch.randperm(effective_length, device="cuda")
            masks = (
                torch.rand(
                    len(inds) // batch_size + 1, batch_size, sequence_len, device="cuda"
                )
                >= MASK_PROB
            ).long()
            MANUAL_SEED += 1
            print("new epoch", len(inds))
            data_idx = 0
            print("my start", inds[0:5])
            print("masks_checksum:", torch.sum(masks))
        if EASY_MODE:
            data_idx_ = data_idx % EASY_MODE_SIZ
        else:
            data_idx_ = data_idx
        offset = inds[data_idx_]  # * SEQUENCE_LEN
        data_idx += 1

        curr = fancy_data[offset : offset + sequence_len].clone().detach()
        temps.append(curr)
    temp = torch.stack(temps, dim=0).cuda()
    mask = masks[data_idx // batch_size]
    mask_not = torch.logical_not(mask).long()
    data = mask * temp + mask_not * 124
    label = temp
    if parallel_state.get_tensor_model_parallel_rank() == 0:
        data_dict = {"text": data, "label": label, "mask_not": mask_not}
    else:
        data_dict = None
    keys = ["text", "label", "mask_not"]
    dtype = torch.int64
    broadcasted_data = tensor_parallel.broadcast_data(keys, data_dict, torch.long)
    return (
        broadcasted_data["text"].long(),
        broadcasted_data["label"].long(),
        broadcasted_data["mask_not"],
    )


easy_data = None


def fwd_step_func(batch, model):
    data, label, loss_mask = batch
    data = data.cuda()
    label = label.cuda()
    loss_mask = loss_mask.cuda()
    y = model(data, torch.ones_like(data), lm_labels=label)

    def loss_func(output_tensor):
        global ONCE
        output_tensor, _ = output_tensor
        lm_loss_ = output_tensor.float()
        lm_loss = torch.sum(lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()
        averaged_loss = average_losses_across_data_parallel_group([lm_loss])
        if data_idx >= 1536:
            assert averaged_loss < 4.8
            if not ONCE:
                print("LOSS OK")
                ONCE = True
        return lm_loss, {"avg": averaged_loss}

    return y, loss_func


def train(
    model, optim, virtual_pipeline_model_parallel_size, pipeline_model_parallel_size, async_comm
):
    sequence_len = global_vars.get_args().seq_length
    micro_batch_size = global_vars.get_args().micro_batch_size
    hidden_size = global_vars.get_args().hidden_size
    forward_backward_func = get_forward_backward_func(
        virtual_pipeline_model_parallel_size, pipeline_model_parallel_size
    )
    tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
    for _ in range(16):
        batch = generate_fancy_data_labels(sequence_len, batch_size)
        optim.zero_grad()
        forward_backward_func(
            fwd_step_func,
            batch,
            model,
            forward_only=False,
            tensor_shape=tensor_shape,
            async_comm=async_comm,
            sequence_parallel_enabled=global_vars.get_args().sequence_parallel,
        )
        # All-reduce layernorm parameters across model parallel nodes
        # when sequence parallelism is used
        if parallel_state.get_tensor_model_parallel_world_size() > 1 and global_vars.get_args().sequence_parallel:
            for model_module in model:
                unwrapped_model = unwrap_model(model_module)
                for param in unwrapped_model.parameters():
                    if getattr(param, 'sequence_parallel_enabled', False):
                        grad = param.grad
                        torch.distributed.all_reduce(grad, group=parallel_state.get_tensor_model_parallel_group())
        optim.step()


if __name__ == "__main__":
    global fancy_data
    global effective_length
...@@ -121,50 +181,80 @@ if __name__ == '__main__':
    effective_length = fancy_data.size(0) // global_vars.get_args().seq_length
    effective_length = fancy_data.size(0) - global_vars.get_args().seq_length
    initialize_distributed("nccl")
    world_size = torch.distributed.get_world_size()
    failure = None
    init = True
    try:
        virtual_pipeline_model_parallel_sizes = (None, 2,)
        if HAS_TORCH_UCC:
            # Deliberately skipping test with interleaved schedule for BERT model.
            # It deadlocks on hybrid UCC/NCCL backend.
            virtual_pipeline_model_parallel_sizes = (None,)
        for virtual_pipeline_model_parallel_size in virtual_pipeline_model_parallel_sizes:
            args = global_vars.get_args()
            async_comm = not args.sequence_parallel and virtual_pipeline_model_parallel_size is None
            data_idx = 0
            ONCE = False
            if init:
                init = False
                args = global_vars.get_args()
                args.padded_vocab_size = 128  # needed in standalone gpt
                args.model_type = ModelType.encoder_or_decoder
                batch_size = args.global_batch_size
                micro_batch_size = args.micro_batch_size
                setup_microbatch_calculator(
                    args.rank,
                    args.rampup_batch_size,
                    args.global_batch_size,
                    args.micro_batch_size,
                    args.data_parallel_size,
                )
            else:
                parallel_state.destroy_model_parallel()
            parallel_state.initialize_model_parallel(
                args.tensor_model_parallel_size,
                args.pipeline_model_parallel_size,
                virtual_pipeline_model_parallel_size,
                default_backend="nccl",
                p2p_backend="ucc" if HAS_TORCH_UCC else "nccl",
            )
            pipeline_model_parallel_size = (
                parallel_state.get_pipeline_model_parallel_world_size()
            )
            tensor_parallel.random.model_parallel_cuda_manual_seed(0)
            model = build_model(
                bert_model_provider,
                wrap_with_ddp=parallel_state.get_data_parallel_world_size() > 1,
                virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
                cpu_offload=args.cpu_offload,
            )
            assert isinstance(model, list)
            assert len(model) == (
                1
                if virtual_pipeline_model_parallel_size is None
                else virtual_pipeline_model_parallel_size
            )
            _param_groups = _get_params_for_weight_decay_optimization(model)
            optim = torch.optim.Adam(_param_groups)
            print(effective_length)
            print(fancy_data.size(0))
            train(
                model,
                optim,
                virtual_pipeline_model_parallel_size,
                args.pipeline_model_parallel_size,
                async_comm,
            )
    except Exception as e:
        failure = str(e)
    finally:
        parallel_state.destroy_model_parallel()
    if failure is not None:
        warnings.warn(
            f"Minimal BERT Pipeline Parallel Failed with: {failure}", DebugWarning
        )
        print(f"Minimal BERT Pipeline Parallel Failed with: {failure}")
    torch.distributed.barrier()
    print(TEST_SUCCESS_MESSAGE)
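# The test below compares torch.nn.functional.cross_entropy against apex's
# vocab_parallel_cross_entropy for increasing tensor-model-parallel sizes,
# requiring losses and gradients to match to within 1e-6.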
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn.functional as F
from apex.transformer import parallel_state
from apex.transformer import tensor_parallel
from apex.transformer.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import set_random_seed
from apex.transformer.testing.commons import IdentityLayer
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
global_vars.set_global_variables()
def torch_cross_entropy(batch_size, seq_length, vocab_size,
logits_scale, seed):
set_random_seed(seed)
identity = IdentityLayer((batch_size, seq_length, vocab_size),
scale=logits_scale).cuda()
logits = identity()
target = torch.cuda.LongTensor(
size=(batch_size, seq_length)).random_(0, vocab_size)
loss = F.cross_entropy(logits.view(-1, logits.size()[-1]),
target.view(-1),
reduction='none').view_as(target).mean()
loss.backward()
return loss, identity.weight.grad
def tensor_sharded_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed):
set_random_seed(seed)
identity = IdentityLayer((batch_size, seq_length, vocab_size), scale=logits_scale).cuda()
logits = identity()
logits_parallel = tensor_parallel.scatter_to_tensor_model_parallel_region(logits)
target = torch.cuda.LongTensor(
size=(batch_size, seq_length)).random_(0, vocab_size)
logits_parallel_ = logits_parallel.clone().detach()
loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
loss.backward()
# check for mutation
assert torch.equal(logits_parallel_, logits_parallel)
return loss, identity.weight.grad
def test_cross_entropy(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing cross entropy with model parallel size {} ...'.
format(tensor_model_parallel_size))
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
batch_size = 13
seq_length = 17
vocab_size_per_partition = 11
logits_scale = 1000.0
vocab_size = vocab_size_per_partition * tensor_model_parallel_size
seed = 1234
loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed)
loss_mpu, grad_mpu = tensor_sharded_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed)
error = loss_torch.sub_(loss_mpu).abs().max()
print(' max error in loss on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
error = grad_torch.sub_(grad_mpu).abs().max()
print(' max error in grad on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
if __name__ == '__main__':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test cross entropy')
test_cross_entropy(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
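# --- Hedged aside (not part of the original test file): a minimal single-process sketch
# of the identity the cross-entropy test above relies on. Splitting the vocab dimension
# into shards and combining the per-shard log-sum-exp terms reproduces full-softmax
# cross entropy; the real vocab_parallel_cross_entropy performs the same combination
# with all-reduces over the tensor-model-parallel group. All names and sizes below are
# local to this sketch.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
batch, vocab, shards = 4, 12, 3
logits = torch.randn(batch, vocab)
target = torch.randint(0, vocab, (batch,))

reference = F.cross_entropy(logits, target)  # full-vocab cross entropy (mean over batch)

# Per-shard log-sum-exp terms are enough to rebuild the softmax denominator.
shard_lse = torch.stack(
    [torch.logsumexp(s, dim=-1) for s in logits.chunk(shards, dim=-1)], dim=-1
)
lse = torch.logsumexp(shard_lse, dim=-1)
target_logit = logits.gather(1, target.unsqueeze(1)).squeeze(1)
sharded = (lse - target_logit).mean()

assert torch.allclose(reference, sharded, atol=1e-6)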
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import operator
import torch
from apex.transformer import parallel_state
from apex.transformer.tensor_parallel import data as data_utils
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
global_vars.set_global_variables()
def test_broadcast_data(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing broadcast_data with model parallel size {} ...'.
format(tensor_model_parallel_size))
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
torch.manual_seed(1234 + parallel_state.get_data_parallel_rank())
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
key_size_t = {
'key1': [7, 11],
'key2': [8, 2, 1],
'key3': [13],
'key4': [5, 1, 2],
'key5': [5, 12],
}
keys = list(key_size_t.keys())
data = {}
data_t = {}
for key in key_size_t:
data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
data_t[key] = data[key].clone()
data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
data_t['keyX'] = data['keyX'].clone()
if parallel_state.get_tensor_model_parallel_rank() != 0:
data = None
data_utils._check_data_types(keys, data_t, torch.int64)
key_size, key_numel, \
total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
for key in keys:
assert key_size[key] == key_size_t[key]
total_numel_t = 0
for key in keys:
target_size = functools.reduce(operator.mul, key_size_t[key], 1)
assert key_numel[key] == target_size
total_numel_t += target_size
assert total_numel == total_numel_t
data_b = data_utils.broadcast_data(keys, data, torch.int64)
for key in keys:
tensor = data_t[key].cuda()
assert data_b[key].sub(tensor).abs().max() == 0
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
if __name__ == '__main__':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test broadcast data')
test_broadcast_data(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
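# --- Hedged aside (not part of the original test file): a minimal sketch of the
# pack/unpack idea behind broadcast_data, with no process group involved. The sizes
# dict plays the role of the key_size dictionary built above; flattening everything
# into one buffer is what lets the real helper issue a single broadcast. All names
# below are local to this sketch.
import torch

data = {"key1": torch.arange(6).view(2, 3), "key2": torch.arange(4)}
sizes = {k: tuple(v.size()) for k, v in data.items()}

flat = torch.cat([v.reshape(-1) for v in data.values()])  # what rank 0 would broadcast

unpacked, offset = {}, 0
for key, size in sizes.items():  # every rank can rebuild tensors from the sizes alone
    numel = 1
    for dim in size:
        numel *= dim
    unpacked[key] = flat[offset:offset + numel].view(size)
    offset += numel

assert all(torch.equal(unpacked[k], data[k]) for k in data)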
@@ -11,7 +11,6 @@ from apex.transformer.pipeline_parallel.schedules.common import build_model
 from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import (
     _forward_backward_pipelining_with_interleaving,
 )
-from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
 from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
 from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator
 from apex.transformer.pipeline_parallel.utils import update_num_microbatches
@@ -19,6 +18,7 @@ from apex.transformer.testing import global_vars
 from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
 from apex.transformer.testing.commons import initialize_distributed
 from apex.transformer.testing.commons import print_separator
+from apex.transformer.testing.commons import fwd_step_func
 from apex.transformer.log_util import get_transformer_logger, set_logging_level
 from apex.transformer.testing.commons import model_provider_func
 from apex.transformer._data import MegatronPretrainingRandomSampler
@@ -31,7 +31,7 @@ _logger = get_transformer_logger("pipeline_parallel_test")
 # note(mkozuki): To see if local batch size increases, uncomment the line below
 # _logger.setLevel("INFO")
 global_vars.set_global_variables(
-    args_defaults={"global_batch_size": 512, "rampup_batch_size": [32, 32, 1000],},
+    args_defaults={"global_batch_size": 512, "rampup_batch_size": [64, 64, 1000],},
     ignore_unknown_args=True,
 )
@@ -44,29 +44,13 @@ HIDDEN_SIZE = 16
 def Dataset(num_samples: int) -> List[Tuple[torch.Tensor, torch.Tensor]]:
-    return [(torch.randn(HIDDEN_SIZE), torch.randn(HIDDEN_SIZE // 2)) for _ in range(num_samples)]
-def process_batch(batch):
-    if isinstance(batch, (list, tuple)):
-        x = batch[0]
-    else:
-        x = batch
-    return x
-def fwd_step_func(micro_batch, model):
-    x = process_batch(micro_batch)
-    y = model(x)
-    # note (mkozuki): I don't think this function is nice but I do think this is enough for now
-    # just to check the sanity of ported pipeline functions.
-    def loss_func(x):
-        loss = torch.sum(x)
-        averaged_loss = average_losses_across_data_parallel_group([loss])
-        return loss, {"avg": averaged_loss}
-    return y, loss_func
+    return [
+        (
+            torch.randn(HIDDEN_SIZE, HIDDEN_SIZE),
+            torch.randn(HIDDEN_SIZE // 2, HIDDEN_SIZE // 2),
+        )
+        for _ in range(num_samples)
+    ]
 # Run forward & backward with dynamic batch size.
@@ -88,9 +72,13 @@ def run_interleaved_with_dynamic_batch_size(
     parallel_state.initialize_model_parallel(
         1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size
     )
-    pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()
+    pipeline_model_parallel_size = (
+        parallel_state.get_pipeline_model_parallel_world_size()
+    )
-    print_separator(f"BatchSamplerCls: {BatchSamplerCls.__name__}, forward_only: {forward_only}")
+    print_separator(
+        f"BatchSamplerCls: {BatchSamplerCls.__name__}, forward_only: {forward_only}"
+    )
     model = build_model(
         model_provider_func,
@@ -122,7 +110,7 @@ def run_interleaved_with_dynamic_batch_size(
         assert isinstance(batch, (list, tuple))
         return [get_num_samples(b) for b in batch]
-    tensor_shape = [micro_batch_size, HIDDEN_SIZE]
+    tensor_shape = [micro_batch_size, HIDDEN_SIZE, HIDDEN_SIZE]
     consumed_samples = 0
     for i in range(NUM_ITERATIONS):
         update_num_microbatches(consumed_samples, consistency_check=False)
@@ -180,7 +168,10 @@ if __name__ == "__main__":
             args.micro_batch_size,
             1,  # args.data_parallel_size,
         )
-        for BatchSamplerCls in (MegatronPretrainingSampler, MegatronPretrainingRandomSampler):
+        for BatchSamplerCls in (
+            MegatronPretrainingSampler,
+            MegatronPretrainingRandomSampler,
+        ):
             for forward_only in (False, True):
                 n_tests += 1
                 pipeline_model_parallel_size = world_size
...
from functools import partial
from typing import List
import time
import torch
try:
import torch_ucc
except ImportError:
HAS_TORCH_UCC = False
else:
HAS_TORCH_UCC = True
print("Use UCC as backend of Pipeline Parallel ProcessGroups")
from apex.transformer import parallel_state
from apex.transformer.enums import ModelType
from apex.transformer.tensor_parallel import model_parallel_cuda_manual_seed
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import unwrap_model
from apex.transformer.pipeline_parallel.utils import (
average_losses_across_data_parallel_group,
)
from apex.transformer.pipeline_parallel.utils import get_ltor_masks_and_position_ids
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.common import (
_get_params_for_weight_decay_optimization,
)
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_without_interleaving import (
forward_backward_pipelining_without_interleaving,
)
from apex.transformer.testing.standalone_gpt import gpt_model_provider
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
MANUAL_SEED = 42
inds = None
data_idx = 0
N_VOCAB = 128
def download_fancy_data():
# import requests
# response = requests.get('https://internet.com/book.txt')
# text = ' '.join(response.text.split())
text = """
An original sentence not subject to any license restrictions, copyright, or royalty payments. Nothing to see here. Commercial or non-commercial use. Research or non-research purposes. The quick brown fox jumps over the lazy dog. Lorem ipsum.
"""
text = text * 1024
encoded = text.encode("ascii", "replace")
ints = [int(encoded[i]) for i in range(len(encoded))]
return torch.tensor(ints)
# build a batch given sequence_len and batch size
def generate_fancy_data_labels(sequence_len, batch_size):
global data_idx
global inds
global MANUAL_SEED
temps = list()
for i in range(batch_size):
if inds is None or data_idx >= len(inds):
# hack: reseed here, because RNG use would otherwise fall out of sync across the different pipeline stages
model_parallel_cuda_manual_seed(MANUAL_SEED)
inds = torch.randperm(effective_length, device="cuda")
MANUAL_SEED += 1
data_idx = 0
data_idx_ = data_idx
offset = inds[data_idx_]
data_idx += 1
curr = fancy_data[offset : offset + sequence_len + 1].clone().detach()
temps.append(curr)
temp = torch.stack(temps, dim=0).cuda()
return temp
easy_data = None
def get_batch(int_tensors: List[torch.Tensor]):
data = int_tensors[0]
# Unpack.
tokens_ = data.long()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
# Get the masks and position ids.
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
N_VOCAB, # tokenizer.eod,
False, # args.reset_position_ids,
False, # args.reset_attention_mask,
False, # args.eod_mask_loss,
)
return tokens, labels, loss_mask, attention_mask, position_ids
# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L75
def loss_func(loss_mask, output_tensor):
losses = output_tensor.float()
loss_mask = loss_mask.view(-1).float()
loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
# Reduce loss for logging.
averaged_loss = average_losses_across_data_parallel_group([loss])
return loss, {"lm loss": averaged_loss[0]}
# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L86
def fwd_step_func(batch, model):
"""Forward step."""
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(batch)
output_tensor = model(tokens, position_ids, attention_mask, labels=labels)
return output_tensor, partial(loss_func, loss_mask)
def train(model, optim, pipeline_model_parallel_size, async_comm):
sequence_len = global_vars.get_args().seq_length
micro_batch_size = global_vars.get_args().micro_batch_size
hidden_size = global_vars.get_args().hidden_size
fwd_bwd_func = forward_backward_pipelining_without_interleaving
tensor_shape = (sequence_len, micro_batch_size, hidden_size)
runtime = 0
# training loop
for i in range(3):
since = time.time()
if torch.distributed.get_rank() == 0:
print("begin iter", i)
batch = [
generate_fancy_data_labels(args.seq_length, args.global_batch_size)
for _ in range(pipeline_model_parallel_size)
]
if torch.distributed.get_rank() == 0:
print("finished making batch...")
optim.zero_grad()
fwd_bwd_func(
fwd_step_func,
batch,
model,
forward_only=False,
tensor_shape=tensor_shape,
async_comm=async_comm,
sequence_parallel_enabled=args.sequence_parallel,
)
if torch.distributed.get_rank() == 0:
print("finished forward step")
# All-reduce layernorm parameters across model parallel nodes
# when sequence parallelism is used
if parallel_state.get_tensor_model_parallel_world_size() > 1 and global_vars.get_args().sequence_parallel:
for model_module in model:
unwrapped_model = unwrap_model(model_module)
for param in unwrapped_model.parameters():
if getattr(param, 'sequence_parallel_enabled', False):
grad = param.grad
torch.distributed.all_reduce(grad, group=parallel_state.get_tensor_model_parallel_group())
optim.step()
if torch.distributed.get_rank() == 0:
print("finished iter", i)
runtime += time.time() - since
return runtime / 3.0
if __name__ == "__main__":
init = True
global_vars.set_global_variables()
for async_comm in (False,) if global_vars.get_args().sequence_parallel else (False, True):
global fancy_data
global effective_length
if init:
init = False
fancy_data = download_fancy_data()
args = global_vars.get_args()
args.model_type = ModelType.encoder_or_decoder
effective_length = fancy_data.size(0) - args.seq_length
initialize_distributed("nccl")
world_size = torch.distributed.get_world_size()
failure = None
args.padded_vocab_size = 128
batch_size = args.global_batch_size
micro_batch_size = args.micro_batch_size
setup_microbatch_calculator(
args.rank,
args.rampup_batch_size,
args.global_batch_size,
args.micro_batch_size,
args.data_parallel_size,
)
world_size = torch.distributed.get_world_size()
print(args.tensor_model_parallel_size, "MODEL PARALLEL SIZE")
parallel_state.initialize_model_parallel(
tensor_model_parallel_size_=args.tensor_model_parallel_size,
pipeline_model_parallel_size_=args.pipeline_model_parallel_size,
default_backend="nccl",
p2p_backend="ucc" if HAS_TORCH_UCC else "nccl",
)
pipeline_model_parallel_size = (
parallel_state.get_pipeline_model_parallel_world_size()
)
model_parallel_cuda_manual_seed(0)
model = build_model(
gpt_model_provider,
wrap_with_ddp=parallel_state.get_data_parallel_world_size() > 1,
virtual_pipeline_model_parallel_size=None,
cpu_offload=args.cpu_offload,
)
assert isinstance(model, list), model
_param_groups = _get_params_for_weight_decay_optimization(model)
optim = torch.optim.Adam(_param_groups)
runtime = train(model, optim, args.pipeline_model_parallel_size, async_comm)
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
print("Average Iteration Time:", runtime)
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from apex.transformer import parallel_state
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
global_vars.set_global_variables()
def test_initialize_model_parallel(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing initialize_model_parallel with size {} ...'.format(
tensor_model_parallel_size))
tensor_model_parallel_size_ = min(
tensor_model_parallel_size,
torch.distributed.get_world_size(),
)
assert not parallel_state.model_parallel_is_initialized()
parallel_state.initialize_model_parallel(tensor_model_parallel_size_)
assert parallel_state.model_parallel_is_initialized()
# Checks.
def check(group, world_size, rank):
assert world_size == torch.distributed.get_world_size(group=group)
assert rank == torch.distributed.get_rank(group=group)
# Model parallel.
world_size = tensor_model_parallel_size_
rank = torch.distributed.get_rank() % tensor_model_parallel_size_
assert world_size == parallel_state.get_tensor_model_parallel_world_size()
assert rank == parallel_state.get_tensor_model_parallel_rank()
check(parallel_state.get_tensor_model_parallel_group(), world_size, rank)
# Data parallel.
world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_
rank = torch.distributed.get_rank() // tensor_model_parallel_size_
assert world_size == parallel_state.get_data_parallel_world_size()
assert rank == parallel_state.get_data_parallel_rank()
check(parallel_state.get_data_parallel_group(), world_size, rank)
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_):
if torch.distributed.get_rank() == 0:
print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format(
tensor_model_parallel_size_))
tensor_model_parallel_size = min(
tensor_model_parallel_size_,
torch.distributed.get_world_size(),
)
assert not parallel_state.model_parallel_is_initialized()
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
assert parallel_state.model_parallel_is_initialized()
# Checks
src_rank = torch.distributed.get_rank() - parallel_state.get_tensor_model_parallel_rank()
assert parallel_state.get_tensor_model_parallel_src_rank() == src_rank
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print('>> passed the test :-)')
if __name__ == '__main__':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test initialize model parallel')
test_initialize_model_parallel(tensor_model_parallel_size)
print_separator('test model parallel source rank')
test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
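# --- Hedged aside (not part of the original test file): the rank arithmetic the asserts
# above rely on, spelled out for a hypothetical world of 8 ranks with tensor model
# parallel size 2 (pipeline parallel size 1). Pure Python; no process groups needed.
demo_world_size, demo_tp = 8, 2
for demo_global_rank in range(demo_world_size):
    demo_tp_rank = demo_global_rank % demo_tp            # tensor-model-parallel rank
    demo_dp_rank = demo_global_rank // demo_tp           # data-parallel rank
    demo_tp_src_rank = demo_global_rank - demo_tp_rank   # first rank of this tensor-parallel group
    assert demo_tp_src_rank == (demo_global_rank // demo_tp) * demo_tp
    print(demo_global_rank, demo_tp_rank, demo_dp_rank, demo_tp_src_rank)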
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn.init as init
from torch.nn.parameter import Parameter
from apex.transformer import parallel_state
from apex.transformer.tensor_parallel import layers
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import set_random_seed
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
global_vars.set_global_variables()
class IdentityLayer3D(torch.nn.Module):
def __init__(self, m, n, k):
super(IdentityLayer3D, self).__init__()
self.weight = Parameter(torch.Tensor(m, n, k))
torch.nn.init.xavier_normal_(self.weight)
def forward(self):
return self.weight
def test_parallel_embedding(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing parallel embedding with model parallel size {} ...'.
format(tensor_model_parallel_size))
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
batch_size = 17
seq_length = 23
vocab_size = 48
hidden_size = 16
seed = 1236
set_random_seed(123)
input_data = torch.LongTensor(
size=(batch_size, seq_length)).random_(0, vocab_size).cuda()
loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda()
set_random_seed(seed)
embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda()
output = embedding_original(input_data)
loss_original = torch.mul(output, loss_weight).sum()
loss_original.backward()
set_random_seed(seed)
embedding_parallel = layers.ParallelEmbedding(
vocab_size, hidden_size, init_method=init.normal_).cuda()
output = embedding_parallel(input_data)
loss_parallel = torch.mul(output, loss_weight).sum()
loss_parallel.backward()
set_random_seed(seed)
embedding_vocab_parallel = layers.VocabParallelEmbedding(
vocab_size, hidden_size, init_method=init.normal_).cuda()
output = embedding_vocab_parallel(input_data)
loss_vocab_parallel = torch.mul(output, loss_weight).sum()
loss_vocab_parallel.backward()
torch.distributed.barrier()
error = loss_parallel.sub(loss_original).abs()
print(' error in loss (parallel) on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-12, 'error: {}'.format(error)
torch.distributed.barrier()
error = loss_vocab_parallel.sub(loss_original).abs()
print(' error in loss (vocab parallel) on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-12, 'error: {}'.format(error)
weight_grad_orig = torch.split(embedding_original.weight.grad,
hidden_size // tensor_model_parallel_size,
1)[parallel_state.get_tensor_model_parallel_rank()]
error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max()
print(' error in grad (parallel) on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-12, 'error: {}'.format(error)
weight_grad_orig = torch.split(embedding_original.weight.grad,
vocab_size // tensor_model_parallel_size,
0)[parallel_state.get_tensor_model_parallel_rank()]
error = embedding_vocab_parallel.weight.grad.sub(
weight_grad_orig).abs().max()
print(' error in grad (vocab parallel) on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-12, 'error: {}'.format(error)
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print('>> passed the test :-)')
def test_initialize_affine_weight(tensor_model_parallel_size, device):
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
if torch.distributed.get_rank() == 0:
print('> testing initialize_affine_weight with model parallel '
'size: {}'.format(tensor_model_parallel_size))
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed = 12345
input_size_coeff = 13
input_size = input_size_coeff * tensor_model_parallel_size
output_size_coeff = 17
output_size = output_size_coeff * tensor_model_parallel_size
# ---------------
# Column parallel
# ---------------
weight = torch.empty(output_size_coeff, input_size)
set_random_seed(seed)
if device == 'cpu':
layers._initialize_affine_weight_cpu(weight, output_size, input_size,
output_size_coeff, 0,
torch.nn.init.normal_,
params_dtype=global_vars.get_args().params_dtype,
)
else:
layers._initialize_affine_weight_gpu(weight, torch.nn.init.normal_, 0)
# Target.
set_random_seed(seed)
master_weight = torch.empty(output_size, input_size)
torch.nn.init.normal_(master_weight)
rank = parallel_state.get_tensor_model_parallel_rank()
my_weight = torch.split(master_weight, output_size_coeff,
dim=0)[rank].contiguous().clone()
# Compare.
error = weight.sub(my_weight).abs().max()
torch.distributed.barrier()
print(' column parallel max error (should be zero) on global rank '
'{}: {}'.format(torch.distributed.get_rank(), error))
assert error < 1.0e-6
# ------------
# Row parallel
# ------------
weight = torch.empty(output_size, input_size_coeff)
set_random_seed(seed)
if device == 'cpu':
layers._initialize_affine_weight_cpu(
weight, output_size, input_size, input_size_coeff, 1, torch.nn.init.normal_,
params_dtype=global_vars.get_args().params_dtype)
else:
layers._initialize_affine_weight_gpu(weight, torch.nn.init.normal_, 1)
# Target.
set_random_seed(seed)
master_weight = torch.empty(output_size, input_size)
torch.nn.init.normal_(master_weight)
rank = parallel_state.get_tensor_model_parallel_rank()
my_weight = torch.split(master_weight, input_size_coeff,
dim=1)[rank].contiguous().clone()
# Compare.
error = weight.sub(my_weight).abs().max()
torch.distributed.barrier()
print(' row parallel max error (should be zero) on global rank '
'{}: {}'.format(torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
class IdentityLayer2D(torch.nn.Module):
def __init__(self, m, n):
super(IdentityLayer2D, self).__init__()
self.weight = Parameter(torch.Tensor(m, n))
torch.nn.init.xavier_normal_(self.weight)
def forward(self):
return self.weight
def test_column_parallel_linear(tensor_model_parallel_size):
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
if torch.distributed.get_rank() == 0:
print('> testing ColumnParallelLinear with model parallel '
'size: {}'.format(tensor_model_parallel_size))
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
input_size_coeff = 13
input_size = input_size_coeff * tensor_model_parallel_size
output_size_coeff = 17
output_size = output_size_coeff * tensor_model_parallel_size
batch_size = 7
# Network
identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
linear_layer = layers.ColumnParallelLinear(
input_size, output_size, keep_master_weight_for_test=True,
params_dtype=global_vars.get_args().params_dtype,
use_cpu_initialization=global_vars.get_args().use_cpu_initialization,
).cuda()
loss_weight = torch.randn([batch_size, output_size]).cuda()
# Forward
input_ = identity_layer()
output, _ = linear_layer(input_)
loss = torch.mul(output, loss_weight).sum()
# Backward
loss.backward()
# Values.
dLdY = loss_weight
X = identity_layer.weight
A = linear_layer.master_weight.cuda()
dLdA = torch.matmul(dLdY.t(), X)
dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
dLdX = torch.matmul(dLdY, A)
rank = parallel_state.get_tensor_model_parallel_rank()
my_dLdA = torch.split(dLdA, output_size_coeff,
dim=0)[rank].contiguous().clone()
error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdA on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
my_dLdb = torch.split(dLdb, output_size_coeff,
dim=0)[rank].contiguous().clone()
error = my_dLdb.sub(linear_layer.bias.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdb on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
error = dLdX.sub(identity_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdX on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
def test_column_parallel_linear_with_async_allreduce_autocast(tensor_model_parallel_size):
autocast_dtypes = (torch.half, torch.bfloat16) if torch.cuda.is_bf16_supported() else (torch.half,)
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
input_size_coeff = 13
input_size = input_size_coeff * tensor_model_parallel_size
output_size_coeff = 17
output_size = output_size_coeff * tensor_model_parallel_size
batch_size = 7
# Network
identity_layer = IdentityLayer3D(batch_size, batch_size, input_size).cuda()
linear_layer = layers.ColumnParallelLinear(
input_size, output_size, keep_master_weight_for_test=True,
params_dtype=global_vars.get_args().params_dtype,
use_cpu_initialization=global_vars.get_args().use_cpu_initialization,
).cuda()
assert linear_layer.async_tensor_model_parallel_allreduce or tensor_model_parallel_size == 1
# Forward
for dtype in autocast_dtypes:
loss_weight = torch.randn([batch_size, output_size]).cuda()
with torch.cuda.amp.autocast(dtype=dtype):
output, _ = linear_layer(identity_layer())
loss = torch.mul(output, loss_weight).sum()
assert output.dtype == dtype
# Backward
loss.backward()
torch.distributed.barrier()
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
def test_column_parallel_linear_with_async_allreduce_custom_amp(tensor_model_parallel_size):
dtypes = (torch.half, torch.bfloat16) if torch.cuda.is_bf16_supported() else (torch.half,)
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
input_size_coeff = 13
input_size = input_size_coeff * tensor_model_parallel_size
output_size_coeff = 17
output_size = output_size_coeff * tensor_model_parallel_size
batch_size = 7
for dtype in dtypes:
# Network
identity_layer = IdentityLayer3D(batch_size, batch_size, input_size).to(device="cuda", dtype=dtype)
linear_layer = layers.ColumnParallelLinear(
input_size, output_size, keep_master_weight_for_test=True,
params_dtype=global_vars.get_args().params_dtype,
use_cpu_initialization=global_vars.get_args().use_cpu_initialization,
).to(device="cuda", dtype=dtype)
# Forward
loss_weight = torch.randn([batch_size, output_size]).cuda()
output, _ = linear_layer(identity_layer())
loss = torch.mul(output, loss_weight).sum()
loss.backward()
torch.distributed.barrier()
assert output.dtype == dtype
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
def test_row_parallel_linear(tensor_model_parallel_size):
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
if torch.distributed.get_rank() == 0:
print('> testing RowParallelLinear with model parallel '
'size: {}'.format(tensor_model_parallel_size))
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
input_size_coeff = 13
input_size = input_size_coeff * tensor_model_parallel_size
output_size_coeff = 17
output_size = output_size_coeff * tensor_model_parallel_size
batch_size = 7
# Network
identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
linear_layer = layers.RowParallelLinear(
input_size, output_size, keep_master_weight_for_test=True,
params_dtype=global_vars.get_args().params_dtype,
use_cpu_initialization=global_vars.get_args().use_cpu_initialization,
).cuda()
loss_weight = torch.randn([batch_size, output_size]).cuda()
# Forward
input_ = identity_layer()
output, _ = linear_layer(input_)
loss = torch.mul(output, loss_weight).sum()
# Backward
loss.backward()
# Values.
dLdY = loss_weight
X = identity_layer.weight
A = linear_layer.master_weight.cuda()
dLdA = torch.matmul(dLdY.t(), X)
dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
dLdX = torch.matmul(dLdY, A)
rank = parallel_state.get_tensor_model_parallel_rank()
my_dLdA = torch.split(dLdA, input_size_coeff,
dim=1)[rank].contiguous().clone()
error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdA on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
error = dLdb.sub(linear_layer.bias.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdb on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
error = dLdX.sub(identity_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdX on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
def parallel_self_attention(tensor_model_parallel_size, num_att_heads_per_partition,
hidden_size_per_att_head, dropout_prob, batch_size,
sequence_length):
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
num_att_heads = num_att_heads_per_partition * \
torch.distributed.get_world_size()
hidden_size = hidden_size_per_att_head * num_att_heads
# Network
identity_layer = IdentityLayer3D(batch_size, sequence_length,
hidden_size).cuda()
attention_layer = parallel_state.BertParallelSelfAttention(hidden_size, num_att_heads,
dropout_prob).cuda()
loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
# Forward
input_ = identity_layer()
output = attention_layer(input_, attention_mask)
loss = torch.mul(output, loss_weight).sum()
# Backward
loss.backward()
rank = parallel_state.get_tensor_model_parallel_rank()
parallel_state.destroy_model_parallel()
return rank, hidden_size, tensor_model_parallel_size, loss, \
attention_layer, identity_layer
def test_parallel_self_attention(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing ParallelSelfAttention with model parallel '
'size: {}'.format(tensor_model_parallel_size))
num_att_heads_per_partition = 3
hidden_size_per_att_head = 7
dropout_prob = 0.0 # has to be zero
batch_size = 5
sequence_length = 13
rank_1, hidden_size_1, tensor_model_parallel_size_1, loss_1, \
attention_layer_1, identity_layer_1 = parallel_self_attention(
1, num_att_heads_per_partition,
hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
rank, hidden_size, tensor_model_parallel_size, loss, \
attention_layer, identity_layer = parallel_self_attention(
tensor_model_parallel_size, num_att_heads_per_partition,
hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
assert hidden_size_1 == hidden_size
error = loss_1.sub(loss).abs().max()
torch.distributed.barrier()
print(' loss error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-6
my_lin_grad_list = torch.split(
attention_layer_1.query_key_value.weight.grad,
hidden_size // tensor_model_parallel_size, 0)[rank::tensor_model_parallel_size]
my_lin_grad = torch.cat(my_lin_grad_list, dim=0)
error = my_lin_grad.sub(
attention_layer.query_key_value.weight.grad).abs().max()
torch.distributed.barrier()
print(' weight gradient error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-6
error = identity_layer_1.weight.grad.sub(
identity_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' input gradient error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-6
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
def parallel_transformer(tensor_model_parallel_size, num_att_heads_per_partition,
hidden_size_per_att_head, batch_size, sequence_length):
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
num_att_heads = num_att_heads_per_partition * \
torch.distributed.get_world_size()
hidden_size = hidden_size_per_att_head * num_att_heads
intermediate_size = 4 * hidden_size
# Network
identity_layer = IdentityLayer3D(batch_size, sequence_length,
hidden_size).cuda()
transformer_layer = parallel_state.BertParallelTransformerLayer(
hidden_size, intermediate_size, num_att_heads, 0.0, 0.0,
torch.nn.functional.relu, 1.0e-5).cuda()
loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
# Forward
input_ = identity_layer()
output = transformer_layer(input_, attention_mask)
loss = torch.mul(output, loss_weight).sum()
# Backward
loss.backward()
rank = parallel_state.get_tensor_model_parallel_rank()
parallel_state.destroy_model_parallel()
return rank, hidden_size, tensor_model_parallel_size, loss, \
transformer_layer, identity_layer
def test_parallel_transformer_layer(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing ParallelTransformerLayer with model parallel '
'size: {}'.format(tensor_model_parallel_size))
num_att_heads_per_partition = 3
hidden_size_per_att_head = 7
batch_size = 5
sequence_length = 13
rank_1, hidden_size_1, tensor_model_parallel_size_1, loss_1, \
transformer_layer_1, identity_layer_1 = parallel_transformer(
1, num_att_heads_per_partition,
hidden_size_per_att_head, batch_size, sequence_length)
rank, hidden_size, tensor_model_parallel_size, loss, \
transformer_layer, identity_layer = parallel_transformer(
tensor_model_parallel_size, num_att_heads_per_partition,
hidden_size_per_att_head, batch_size, sequence_length)
error = loss_1.sub(loss).abs().max()
torch.distributed.barrier()
print(' loss error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-5, 'error: {}'.format(error)
error = identity_layer_1.weight.grad.sub(
identity_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' input gradient error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-5, 'error: {}'.format(error)
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
if __name__ == '__main__':
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
exceptions = []
print_separator('test initialize affine weight cpu')
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
try:
test_initialize_affine_weight(tensor_model_parallel_size, 'cpu')
except Exception as e:
exceptions.append(f"test_initialize_affine_weight-cpu with tensor model parallel size of {tensor_model_parallel_size} failed: {str(e)}")
# Reset groups
parallel_state.destroy_model_parallel()
break
else:
tensor_model_parallel_size *= 2
# Reset groups
parallel_state.destroy_model_parallel()
print_separator('test initialize affine weight gpu')
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
try:
test_initialize_affine_weight(tensor_model_parallel_size, 'gpu')
except Exception as e:
exceptions.append(f"test_initialize_affine_weight-gpu with tensor model parallel size of {tensor_model_parallel_size} failed: {str(e)}")
# Reset groups
parallel_state.destroy_model_parallel()
break
else:
tensor_model_parallel_size *= 2
# Deleted, replaced with vocab parallel embedding?
#tensor_model_parallel_size = 1
#while tensor_model_parallel_size <= world_size:
# print_separator('test parallel embedding')
# test_parallel_embedding(tensor_model_parallel_size)
# tensor_model_parallel_size *= 2
print_separator('test column-parallel linear')
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
try:
test_column_parallel_linear(tensor_model_parallel_size)
except Exception as e:
exceptions.append(f"test_column_parallel_linear with tensor model parallel size of {tensor_model_parallel_size} failed: {str(e)}")
# Reset groups
parallel_state.destroy_model_parallel()
break
else:
tensor_model_parallel_size *= 2
print_separator('test row-parallel linear')
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
try:
test_row_parallel_linear(tensor_model_parallel_size)
except Exception as e:
exceptions.append(f"test_row_parallel_linear with tensor model parallel size of {tensor_model_parallel_size} failed: {str(e)}")
# Reset groups
parallel_state.destroy_model_parallel()
break
else:
tensor_model_parallel_size *= 2
print_separator("test ColumnParallelLinearWithAsyncAllreduce - autocast")
tensor_model_parallel_size = 2
while tensor_model_parallel_size <= world_size:
try:
test_column_parallel_linear_with_async_allreduce_autocast(tensor_model_parallel_size)
except Exception as e:
exceptions.append(f"test_column_parallel_linear_with_async_allreduce_autocast with tensor model parallel size of {tensor_model_parallel_size} failed: {str(e)}")
# Reset groups
parallel_state.destroy_model_parallel()
break
else:
tensor_model_parallel_size *= 2
print_separator("test ColumnParallelLinearWithAsyncAllreduce - custom AMP")
tensor_model_parallel_size = 2
while tensor_model_parallel_size <= world_size:
try:
test_column_parallel_linear_with_async_allreduce_custom_amp(tensor_model_parallel_size)
except Exception as e:
exceptions.append(f"test_column_parallel_linear_with_async_allreduce_custom_amp with tensor model parallel size of {tensor_model_parallel_size} failed: {str(e)}")
# Reset groups
parallel_state.destroy_model_parallel()
break
else:
tensor_model_parallel_size *= 2
if exceptions:
raise RuntimeError("\n".join(exceptions))
# Deleted
#print_separator('test parallel self-attention')
#tensor_model_parallel_size = 1
#while tensor_model_parallel_size <= world_size:
# test_parallel_self_attention(tensor_model_parallel_size)
# tensor_model_parallel_size *= 2
# Deleted because ParallelTransformerLayer no longer exists
# print_separator('test parallel transformer')
# tensor_model_parallel_size = 1
# while tensor_model_parallel_size <= world_size:
# test_parallel_transformer_layer(tensor_model_parallel_size)
# tensor_model_parallel_size *= 2
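# --- Hedged aside (not part of the original test file): a single-process sketch of the
# weight split the column/row parallel tests above check. Splitting a Linear's weight
# along output features (column parallel, concatenate outputs) or along input features
# (row parallel, sum partial outputs) reproduces the full layer's output. All names and
# sizes below are local to this sketch.
import torch

torch.manual_seed(0)
batch, in_features, out_features, parts = 4, 6, 8, 2
x = torch.randn(batch, in_features)
w = torch.randn(out_features, in_features)

# Column parallel: each shard owns a slice of the output features.
col_shards = w.chunk(parts, dim=0)
y_col = torch.cat([x @ s.t() for s in col_shards], dim=-1)

# Row parallel: each shard owns a slice of the input features.
row_shards = w.chunk(parts, dim=1)
x_shards = x.chunk(parts, dim=1)
y_row = sum(xs @ s.t() for xs, s in zip(x_shards, row_shards))

full = x @ w.t()
assert torch.allclose(y_col, full, atol=1e-6)
assert torch.allclose(y_row, full, atol=1e-5)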
import torch
from apex.transformer import parallel_state
from apex.transformer.tensor_parallel import mappings
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import initialize_distributed
global_vars.set_global_variables()
def test__reduce(args, tensor_model_parallel_size):
print("Testing reduction size =", tensor_model_parallel_size)
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
assert torch.equal(
mappings._reduce(torch.full((10, 10, 10, 10), 50)),
torch.full((10, 10, 10, 10), 50 * tensor_model_parallel_size),
)
parallel_state.destroy_model_parallel()
print("Passed!")
def test__split(args, tensor_model_parallel_size):
print("Testing splitting size =", tensor_model_parallel_size)
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
listy = []
for i in range(tensor_model_parallel_size):
listy.append(torch.randn(10, 1))
x = torch.cat(tuple(listy), 1)
out = mappings._split(x)
assert torch.equal(out, listy[parallel_state.get_tensor_model_parallel_rank()])
parallel_state.destroy_model_parallel()
print("Passed!")
def test__gather(args, tensor_model_parallel_size):
print("Testing gathering size =", tensor_model_parallel_size)
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
assert torch.equal(
mappings._gather(torch.tensor([parallel_state.get_tensor_model_parallel_rank()])),
torch.tensor(list(range(tensor_model_parallel_size))),
)
parallel_state.destroy_model_parallel()
print("Passed!")
if __name__ == "__main__":
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
args = global_vars.get_args()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
test__reduce(args, tensor_model_parallel_size)
test__split(args, tensor_model_parallel_size)
test__gather(args, tensor_model_parallel_size)
tensor_model_parallel_size *= 2
print(">> passed the test :-)")
from functools import partial
import logging
from typing import List
import torch
from apex.transformer import parallel_state
from apex.transformer.pipeline_parallel.schedules.common import _get_params_for_weight_decay_optimization
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import _forward_backward_pipelining_with_interleaving
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_without_interleaving import forward_backward_pipelining_without_interleaving
from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
from apex.transformer.pipeline_parallel.utils import get_ltor_masks_and_position_ids
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import update_num_microbatches
from apex.transformer.tensor_parallel import model_parallel_cuda_manual_seed
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.standalone_gpt import gpt_model_provider
from apex.transformer.log_util import get_transformer_logger, set_logging_level
set_logging_level(logging.NOTSET)
_logger = get_transformer_logger("megatron_gpt_pipeline_test")
global_vars.set_global_variables()
N_VOCAB = 8192
def generate_batch(batch_size, sequence_length):
size = batch_size, sequence_length + 1
int_tensor = torch.randint(low=0, high=N_VOCAB, size=size, dtype=torch.long).cuda()
return int_tensor,
# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L44
def get_batch(int_tensors: List[torch.Tensor]):
data = int_tensors[0]
# Unpack.
tokens_ = data.long()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
# Get the masks and position ids.
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
N_VOCAB, # tokenizer.eod,
False, # args.reset_position_ids,
False, # args.reset_attention_mask,
False, # args.eod_mask_loss,
)
return tokens, labels, loss_mask, attention_mask, position_ids
# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L75
def loss_func(loss_mask, output_tensor):
losses = output_tensor.float()
loss_mask = loss_mask.view(-1).float()
loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
# Reduce loss for logging.
averaged_loss = average_losses_across_data_parallel_group([loss])
return loss, {'lm loss': averaged_loss[0]}
# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L86
# TODO (mkozuki): Currently I'm seeing no attribute `word_embeddings` which looks weird.
def forward_step(batch, model):
"""Forward step."""
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(batch)
output_tensor = model(tokens, position_ids, attention_mask, labels=labels)
return output_tensor, partial(loss_func, loss_mask)
def run_gpt(pipeline_model_parallel_size, virtual_pipeline_model_parallel_size=None, forward_only=False):
parallel_state.initialize_model_parallel(1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size)
model_parallel_cuda_manual_seed(42)
model = build_model(
gpt_model_provider, True,
virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size)
_logger.debug("building model")
assert isinstance(model, list)
assert len(model) == (1 if virtual_pipeline_model_parallel_size is None else virtual_pipeline_model_parallel_size)
_param_groups = _get_params_for_weight_decay_optimization(model)
torch.optim.Adam(_param_groups)
if parallel_state.is_pipeline_last_stage():
_logger.debug("checking `word_embeddings` existence")
for m in model:
assert hasattr(m, "word_embeddings")
args = global_vars.get_args()
if virtual_pipeline_model_parallel_size is None:
batch = generate_batch(args.global_batch_size, args.seq_length)
else:
batch = [generate_batch(args.global_batch_size, args.seq_length) for _ in range(virtual_pipeline_model_parallel_size)]
_logger.debug("preparing batch")
if virtual_pipeline_model_parallel_size is None:
fwd_bwd_func = forward_backward_pipelining_without_interleaving
else:
fwd_bwd_func = _forward_backward_pipelining_with_interleaving
_logger.debug(f"selecting forward_backward func: {fwd_bwd_func}")
tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
_logger.debug(f"`tensor_shape`: {tensor_shape}")
fwd_bwd_func(forward_step, batch, model, forward_only=forward_only, tensor_shape=tensor_shape)
_logger.debug(TEST_SUCCESS_MESSAGE)
if __name__ == "__main__":
initialize_distributed()
args = global_vars.get_args()
args.padded_vocab_size = N_VOCAB
setup_microbatch_calculator(
args.rank,
args.rampup_batch_size,
args.global_batch_size,
args.micro_batch_size,
1, # args.data_parallel_size,
)
update_num_microbatches(0, True)
print_separator("run GPT model")
try:
run_gpt(torch.distributed.get_world_size())
# TODO(mkozuki): handle exceptions correctly; for now they are swallowed lazily
# because this test is not kicked off by CI
except Exception as e:
_logger.debug(str(e))
pass
finally:
parallel_state.destroy_model_parallel()
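# --- Hedged aside (not part of the original test file): the microbatch arithmetic behind
# setup_microbatch_calculator / update_num_microbatches, assuming no ramp-up. The numbers
# are illustrative only and are not taken from the test configuration above.
demo_global_batch_size, demo_micro_batch_size, demo_data_parallel_size = 64, 4, 2
samples_per_step_per_rank = demo_global_batch_size // demo_data_parallel_size
num_micro_batches = samples_per_step_per_rank // demo_micro_batch_size
assert demo_micro_batch_size * num_micro_batches * demo_data_parallel_size == demo_global_batch_size
print(num_micro_batches)  # 8 micro-batches per pipeline flush on each data-parallel rank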
from typing import Optional, Union, List
import torch
import torch.nn as nn
import apex
from apex.transformer import parallel_state
from apex.transformer.pipeline_parallel import get_forward_backward_func
from apex.transformer.pipeline_parallel.schedules.common import _get_params_for_weight_decay_optimization
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_no_pipelining import forward_backward_no_pipelining
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import _forward_backward_pipelining_with_interleaving
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_without_interleaving import forward_backward_pipelining_without_interleaving
from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import update_num_microbatches
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator
from apex.transformer.log_util import get_transformer_logger, set_logging_level
# set_logging_level("INFO")
_logger = get_transformer_logger("pipeline_parallel_test")
global_vars.set_global_variables()
batch_size, micro_batch_size = None, None
hidden_size = 16
fwd_bwd_functions = {
"no_pipelining": forward_backward_no_pipelining,
"no_interleaving": forward_backward_pipelining_without_interleaving,
"interleaving": _forward_backward_pipelining_with_interleaving,
}
# note (mkozuki): `pre_process` and `post_process` are placeholders until the interleaving schedule test is added.
class MyLayer(nn.Module):
def __init__(self, pre_process: bool, post_process: bool):
super().__init__()
self.pre_process = pre_process
self.post_process = post_process
self.layer = nn.Linear(hidden_size, hidden_size)
def forward(self, x):
return self.layer(x)
class MyModel(nn.Module):
def __init__(self, pre_process: bool = False, post_process: bool = False) -> None:
super().__init__()
self.pre_process = pre_process
self.post_process = post_process
self.layer = MyLayer(pre_process=pre_process, post_process=post_process)
self.input_tensor = None
def set_input_tensor(self, input_tensor: Union[torch.Tensor, List[torch.Tensor]]) -> None:
self.input_tensor = input_tensor
def forward(self, x: Optional[torch.Tensor]) -> torch.Tensor:
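# Non-first pipeline stages receive their activation via `set_input_tensor`,
# in which case the direct argument `x` is ignored.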
if self.input_tensor is None:
return self.layer(x)
return self.layer(self.input_tensor)
def model_provider_func(pre_process, post_process) -> MyModel:
return MyModel(pre_process, post_process)
def process_batch(batch):
if isinstance(batch, list):
x = batch[0]
else:
x = batch
return x
def fwd_step_func(batch, model):
x = process_batch(batch)
y = model(x)
# note (mkozuki): This loss function is simplistic, but it is sufficient for now
# to sanity-check the ported pipeline functions.
def loss_func(x):
loss = torch.sum(x)
averaged_loss = average_losses_across_data_parallel_group([loss])
return loss, {'avg': averaged_loss}
return y, loss_func
# TODO (mkozuki): Add a case with `autocast` and `GradScaler`.
# Run forward & backward for one minibatch.
def forward_backward_func_template(
name: str,
forward_backward_func,
pipeline_model_parallel_size: int,
forward_only: bool,
) -> None:
print_separator(f"name: {name}, pipeline model parallel size: {pipeline_model_parallel_size}")
virtual_pipeline_model_parallel_size = 2 if name == "interleaving" else None
if name == "no_pipelining":
# note (mkozuki): `forward_backward_no_pipelining` is **NOT** compatible with
# pipeline_model_parallel_size > 1, so initialize both the tensor and pipeline
# model parallel sizes to 1 here.
parallel_state.initialize_model_parallel(1, 1, None)
else:
# NOTE (mkozuki): `virtual_pipeline_model_parallel_size` is required to enable the interleaved schedule.
# In Megatron, `args.virtual_pipeline_model_parallel_size` is computed in megatron/arguments.py and
# used throughout, but this test builds a custom model, so it is safe to set the value directly here.
parallel_state.initialize_model_parallel(
1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size)
if virtual_pipeline_model_parallel_size is not None:
# Check the experimental warning message
get_forward_backward_func(virtual_pipeline_model_parallel_size, pipeline_model_parallel_size)
pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()
model = build_model(
model_provider_func,
wrap_with_ddp=True,
virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
)
assert isinstance(model, list)
assert len(model) == (1 if virtual_pipeline_model_parallel_size is None else virtual_pipeline_model_parallel_size)
_param_groups = _get_params_for_weight_decay_optimization(model)
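# The optimizer below is built only to exercise the weight-decay param-group helper;
# its return value is discarded because this test never calls `step()`.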
torch.optim.Adam(_param_groups, lr=1e-4)
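# `batch` holds one full minibatch for this data-parallel rank, while `tensor_shape`
# handed to the schedule below describes a single micro-batch.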
tensor_shape = [batch_size // parallel_state.get_data_parallel_world_size(), hidden_size]
batch = (torch.randn(tensor_shape).cuda(),)
tensor_shape[0] = micro_batch_size
update_num_microbatches(0)
forward_backward_func(
fwd_step_func, batch, model, forward_only=forward_only, tensor_shape=tensor_shape)
if not forward_only:
for m in model:
for p in m.parameters():
if p.grad is None:
raise RuntimeError("grad not found")
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
if __name__ == "__main__":
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
n_tests = 0
failures = []
initialize_distributed()
world_size = torch.distributed.get_world_size()
args = global_vars.get_args()
batch_size = args.global_batch_size
micro_batch_size = args.micro_batch_size
setup_microbatch_calculator(
args.rank,
args.rampup_batch_size,
args.global_batch_size,
args.micro_batch_size,
1, # args.data_parallel_size,
)
for forward_only in (True, False):
for name, forward_backward_func in fwd_bwd_functions.items():
n_tests += 1
# TODO (mkozuki): Test with data parallel size > 1.
pipeline_model_parallel_size = world_size
try:
forward_backward_func_template(
name,
forward_backward_func,
pipeline_model_parallel_size,
forward_only,
)
except Exception as e:
failures.append(
f"\t# {name} failed with pipeline size: {pipeline_model_parallel_size} "
f"and forward_only: {forward_only}\n"
f"pipeline rank: {parallel_state.get_pipeline_model_parallel_rank()}, "
f"virtual pipeline rank: {parallel_state.get_virtual_pipeline_model_parallel_rank()}\n"
f"{str(e)}"
)
else:
print_separator(f"{name} works")
finally:
parallel_state.destroy_model_parallel()
print_separator("TEST RESULT")
if failures:
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print("\n".join(failures))
msg = f"{len(failures)} / {n_tests} cases failed"
raise RuntimeError(msg)
else:
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print("### PASS!")
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from apex.transformer import parallel_state
from apex.transformer import tensor_parallel
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
global_vars.set_global_variables()
def test_set_cuda_rng_state(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing set_rng_state with size {} ...'.
format(tensor_model_parallel_size))
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
size = 123
seed = 1234
torch.cuda.manual_seed(seed)
tensor = torch.cuda.FloatTensor(size)
# Get the state
rng_state = torch.cuda.get_rng_state()
rng_state_copy = rng_state.clone()
# Advance the default CUDA RNG by drawing a few tensors.
for _ in range(5):
torch.randn(size, out=tensor)
result_1 = tensor.clone()
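# `rng_state` is a snapshot, so it still equals its copy, while the live device
# state has advanced past it.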
assert rng_state.sub(rng_state_copy).max() == 0
assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0
# State should be different.
new_rng_state = torch.cuda.get_rng_state()
max_diff = new_rng_state.sub(rng_state).max()
print(' max diff in rng state (should be non-zero) on global rank {}: {}'.
format(torch.distributed.get_rank(), max_diff))
assert max_diff > 0
# Reset the rng state and do the same stuff.
tensor_parallel.random._set_cuda_rng_state(rng_state)
for _ in range(5):
torch.randn(size, out=tensor)
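# Restore the same state once more: the draws below should rewind the generator
# and reproduce result_1 regardless of the draws just above.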
tensor_parallel.random._set_cuda_rng_state(rng_state)
for _ in range(5):
torch.randn(size, out=tensor)
result_2 = tensor.clone()
# Results should be the same
error = result_2.sub(result_1).abs().max()
print(' max error in generated tensors (should be zero) on '
'global rank {}: {}'.format(torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Input state should have remained intact.
error = rng_state.sub(rng_state_copy).max()
print(' max error in rng state (should be zero) on global rank {}: {}'.
format(torch.distributed.get_rank(), error))
assert error == 0
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
def test_cuda_rng_tracker(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing cuda rng tracker with size {} ...'.
format(tensor_model_parallel_size))
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed_1 = 1234
seed_2 = 4321
size = [12, 21]
tensor = torch.cuda.FloatTensor(size)
# Set to seed_1 and generate two tensors.
torch.cuda.manual_seed(seed_1)
torch.randn(size, out=tensor)
target_11 = tensor.clone()
torch.randn(size, out=tensor)
target_12 = tensor.clone()
# Set to seed_2 and generate two tensors.
torch.cuda.manual_seed(seed_2)
torch.randn(size, out=tensor)
target_21 = tensor.clone()
torch.randn(size, out=tensor)
target_22 = tensor.clone()
# Now if we interleave seed_1 and seed_2,
# we should still get the same tensors
torch.cuda.manual_seed(seed_1)
tensor_parallel.random.get_cuda_rng_tracker().add('test', seed_2)
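# The 'test' generator state is seeded with seed_2 while the default state continues
# from seed_1, so draws inside fork('test') should match the seed_2 targets above.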
torch.randn(size, out=tensor)
result_11 = tensor.clone()
with tensor_parallel.random.get_cuda_rng_tracker().fork('test'):
torch.randn(size, out=tensor)
result_21 = tensor.clone()
torch.randn(size, out=tensor)
result_12 = tensor.clone()
with tensor_parallel.random.get_cuda_rng_tracker().fork('test'):
torch.randn(size, out=tensor)
result_22 = tensor.clone()
diff = result_11.sub(result_21).abs().max()
diff = min(diff, result_12.sub(result_22).abs().max())
print(' max diff in generated tensors (should be non-zero) on '
'global rank {}: {}'.format(torch.distributed.get_rank(), diff))
assert diff > 1.0e-6
error = max(result_11.sub(target_11).abs().max(),
result_12.sub(target_12).abs().max())
error = max(error, result_21.sub(target_21).abs().max())
error = max(error, result_22.sub(target_22).abs().max())
print(' max error in generated tensors (should be zero) on '
'global rank {}: {}'.format(torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset the tracker
tensor_parallel.random.get_cuda_rng_tracker().reset()
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
def test_model_parallel_cuda_manual_seed(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print(
'> testing model parallel cuda manual seed with size {} ...'.format(
tensor_model_parallel_size))
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
tensor_parallel.random.model_parallel_cuda_manual_seed(12345)
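# The default CUDA generator keeps the base seed; the tensor-model-parallel tracker
# state is offset by 2718 plus the tensor-model-parallel rank, as the asserts below check.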
assert torch.cuda.initial_seed() == 12345
with tensor_parallel.random.get_cuda_rng_tracker().fork():
assert (
torch.cuda.initial_seed() ==
12345 + 2718 + parallel_state.get_tensor_model_parallel_rank()
)
# Reset the tracker
tensor_parallel.random.get_cuda_rng_tracker().reset()
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
if __name__ == '__main__':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test set rng state')
test_set_cuda_rng_state(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test cuda rng tracker')
test_cuda_rng_tracker(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test model parallel cuda manual seed')
test_model_parallel_cuda_manual_seed(tensor_model_parallel_size)
tensor_model_parallel_size *= 2