Unverified commit 96850dfa authored by Jithun Nair, committed by GitHub

Merge pull request #80 from ROCmSoftwarePlatform/IFU-master-2022-07-29

IFU-master-2022-07-29
parents 87fc4125 cc5f83b5
import argparse
import random
import sys
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from apex import amp
from apex.optimizers import FusedAdam
from apex.contrib.optimizers.distributed_fused_adam import DistributedFusedAdam
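# Compares apex.optimizers.FusedAdam (wrapped with amp O2 + DDP) against the
# contrib DistributedFusedAdam on an identical toy MLP: inputs, outputs,
# gradients, and updated weights are checked for agreement at every step.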
class TestModel(torch.nn.Module):
def __init__(self, args):
super(TestModel, self).__init__()
self.linear = torch.nn.Sequential(*[torch.nn.Linear(args.dim, args.dim, bias=args.bias) for _ in range(args.layers)])
def forward(self, x):
return self.linear(x)
def setup(args):
## Model
ref_model = TestModel(args).cuda()
dist_model = TestModel(args).cuda()
# Same weights
with torch.no_grad():
for dp, rp in zip(dist_model.parameters(), ref_model.parameters()):
dp.data.copy_(rp.data)
dist_model = dist_model.half()
## Optimizer
# same hyperparameters
ref_opt_args = { 'lr': 1e-3, 'eps': 1e-6, 'weight_decay': 0.01 }
ref_opt = FusedAdam(ref_model.parameters(), **ref_opt_args)
dist_opt_args = ref_opt_args.copy()
dist_opt_args.update( {'overlap_reductions' : False} )
dist_opt_args.update( {'process_group_size' : args.n_gpu} )
dist_opt_args.update( {'dwu_group_size' : args.dwu_group_size} )
dist_opt_args.update( {'dwu_num_blocks' : 1} )
dist_opt_args.update( {'dwu_num_chunks' : 1} )
dist_opt = DistributedFusedAdam(dist_model.parameters(), **dist_opt_args)
dist_opt.set_global_scale(1.)
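    # The distributed optimizer runs with a fixed global scale of 1.0, while the
    # reference path below relies on amp's dynamic loss scaling.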
## amp-init
amp_args = { 'loss_scale' : 'dynamic' , 'opt_level' : 'O2'}
ref_model, ref_opt = amp.initialize(ref_model, ref_opt, **amp_args)
## DDP
ref_model = DDP(ref_model, device_ids=[args.rank])
with torch.no_grad():
for dp in dist_model.parameters():
torch.distributed.broadcast(dp.data, src=0)
for rp in ref_model.parameters():
torch.distributed.broadcast(rp.data, src=0)
torch.cuda.synchronize()
torch.distributed.barrier()
if get_rank() == 0:
print(f'dist opt with {args.n_gpu} GPUs')
return ref_model, ref_opt, dist_model, dist_opt
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=-1)
parser.add_argument('--steps', type=int, default=20)
parser.add_argument('--batch', type=int, default=32)
parser.add_argument('--dim', type=int, default=4)
parser.add_argument('--layers', type=int, default=2)
parser.add_argument('--bias', action='store_true')
parser.add_argument('--atol', type=float, default=1e-3)
parser.add_argument('--rtol', type=float, default=1)
parser.add_argument('--dwu_group_size', type=float, default=1)
args = parser.parse_args()
return args
def setup_env(args):
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')
args.rank = torch.distributed.get_rank()
args.n_gpu = torch.distributed.get_world_size()
seed = 42 + get_rank()
random.seed(seed)
torch.manual_seed(seed)
return args
def get_rank():
return torch.distributed.get_rank()
def main():
args = parse_args()
args = setup_env(args)
tol_args = { 'atol' : args.atol, 'rtol' : args.rtol }
torch.set_printoptions(precision=16)
ref_model, ref_opt, dist_model, dist_opt = setup(args)
# lazy_init not called yet, initialize stash
stash = ref_opt._amp_stash
stash.all_fp16_params, stash.all_fp32_from_fp16_params = [], []
# make sure everything from _first_step_init_ is ready before training
# e.g. registering allreduce_hook
# so that gradients are copied/reduced when necessary
dist_opt._init_everything()
for i in range(args.steps):
x_ref = torch.randn(args.batch, args.dim, dtype=torch.half).cuda().requires_grad_(True)
x_dist = x_ref.clone().detach().requires_grad_(True)
if get_rank() == 0:
print(f'[{i}] Checking input')
#print("x_ref:", x_ref.flatten()[:10])
#print("x_dist:", x_dist.flatten()[:10])
assert(torch.allclose(x_ref, x_dist, **tol_args))
y_ref = ref_model(x_ref).half()
y_dist = dist_model(x_dist)
if get_rank() == 0:
print(f'[{i}] Checking output')
#print("y_ref:", y_ref.flatten()[:10])
#print("y_dist:", y_dist.flatten()[:10])
assert(torch.allclose(y_ref, y_dist, **tol_args))
dy = torch.randn_like(y_ref)
y_ref.backward(dy)
y_dist.backward(dy)
if get_rank() == 0:
print(f'[{i}] Checking gradients')
torch.distributed.barrier()
torch.cuda.synchronize()
assert(torch.allclose(x_ref.grad, x_dist.grad, **tol_args))
# gradient all-reduce within distributed optimizer
dist_opt.complete_reductions()
if get_rank() == 0:
print(f'[{i}] Stepping')
ref_opt.step()
dist_opt.step()
torch.cuda.synchronize()
torch.distributed.barrier()
print('Checking new weights')
if get_rank() == 0:
print("ref param:", ref_model.module.linear[0].weight)
print("dist param:", dist_model.linear[0].weight)
for i, (rp, dp) in enumerate(zip(ref_model.parameters(), dist_model.parameters())):
if not torch.allclose(rp, dp, **tol_args):
if get_rank() == 0:
print(f'Rank: {get_rank()}, Param: {i}')
print(f'ref: {rp.sum().item()}, dist: {dp.sum().item()}')
print(rp)
print(dp)
print(torch.abs(rp-dp) > tol_args['atol'])
sys.exit(0)
# zero grads
for rp, dp in zip(ref_model.parameters(), dist_model.parameters()):
rp.grad = None
dp.grad = None
if __name__ == "__main__":
main()
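# Example launch (an assumption; the --local_rank flag matches the legacy
# torch.distributed.launch convention):
#   python -m torch.distributed.launch --nproc_per_node=<num_gpus> <this_script>.py --steps 20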
import os
import math
import random
import unittest
from itertools import product

import torch
import apex
from torch.optim import Optimizer
class TestFusedOptimizer(unittest.TestCase):
def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
self.max_abs_diff = max_abs_diff
self.max_rel_diff = max_rel_diff
self.iters = iters
torch.cuda.manual_seed(9876)
torch.manual_seed(9876)
def tearDown(self):
pass
def gen_param_optim(self, tensors, options, tst_options=None):
        # Keep backward compatibility with existing tests: if "tst_options" is not
        # provided, fall back to "options", which holds the parameters for the
        # reference optimizer.
        if tst_options is None:
            tst_options = options
ref_param = []
tst_param = []
for tensor in tensors:
@@ -60,11 +59,11 @@ class TestFusedOptimizer(unittest.TestCase):
return max_abs_diff, max_rel_diff
    def gen_single_type_test(self, param_type=torch.float, device='cuda', *, skip_assert: bool = False):
nelem = 278011
# Some ref and test optimizers may require different set of options.
        # This is a quick workaround to add that functionality while making
# minimum changes in existing code.
# If there is no "tst_options" field provided, safe to initialize
# the test optimizer with the parameters of reference optimizer.
@@ -80,6 +79,8 @@ class TestFusedOptimizer(unittest.TestCase):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
if skip_assert:
return
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
@@ -87,8 +88,8 @@ class TestFusedOptimizer(unittest.TestCase):
class TestFusedAdam(TestFusedOptimizer):
    def setUp(self):
        super().setUp()
self.options = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay': 0, 'amsgrad': False}
self.ref_optim = torch.optim.Adam
@@ -98,9 +99,15 @@ class TestFusedAdam(TestFusedOptimizer):
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
# NOTE(mkozuki): Current threshold values look too small for BFloat16.
# TODO(mkozuki): Refactor `TestFusedOptimizer`
@unittest.skip("NaN issue observed on ROCm as of 12/1/2021. The failing unit test is introduced by a PyTorch commit sometime in between rocm/pytorch:rocm4.3.1_ubuntu18.04_py3.6_pytorch_1.9.0 and 2021/12/01. Please refer to https://github.com/ROCmSoftwarePlatform/apex/issues/63")
def test_half(self):
        self.gen_single_type_test(param_type=torch.float16, skip_assert=True)
@unittest.skip("Skipped the test since a regression introduced from PyTorch upstream: due to https://github.com/pytorch/pytorch/issues/80809#issuecomment-1175211598. Please also refer to https://github.com/ROCmSoftwarePlatform/apex/issues/82")
def test_bfloat16(self):
self.gen_single_type_test(param_type=torch.bfloat16, skip_assert=True)
@unittest.skip("Skipped the test since a regression introduced from PyTorch upstream: due to https://github.com/pytorch/pytorch/issues/80809#issuecomment-1175211598. Please also refer to https://github.com/ROCmSoftwarePlatform/apex/issues/82")
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
......
import inspect
import unittest
from apex.pyprof.prof.data import Data
from apex.pyprof.prof.prof import foo
class TestPyProfData(unittest.TestCase):
def __init__(self, testName):
super().__init__(testName)
def setUp(self):
pass
def tearDown(self):
pass
def test_data(self):
kernels = [
{'kShortName': 'elementwise_kernel', 'kDuration': 2848, 'layer': [], 'trace': [], 'reprMarkers': [], 'marker': ["{'mod': 'Tensor', 'op': 'float', 'args': [{'name': '', 'type': 'tensor', 'shape': (18, 104, 160), 'dtype': 'bool'}]}"], 'seqMarker': ['to, seq = 60471'], 'seqId': [60471], 'subSeqId': 0, 'altSeqId': [], 'dir': 'fprop', 'mod': ['Tensor'], 'op': ['float'], 'tid': 1431533376, 'device': 0, 'stream': 7, 'grid': (585, 1, 1), 'block': (512, 1, 1), 'kLongName': 'void at::native::elementwise_kernel<512, 1, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1} const&)::{lambda(int)#1}>(int, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1} const&)::{lambda(int)#1})'},
{'kShortName': 'elementwise_kernel', 'kDuration': 201182, 'layer': [], 'trace': [], 'reprMarkers': [], 'marker': ["{'mod': 'Tensor', 'op': 'clone', 'args': [{'name': '', 'type': 'tensor', 'shape': (18, 4, 416, 640), 'dtype': 'float32'}]}"], 'seqMarker': ['clone, seq = 60161'], 'seqId': [60161], 'subSeqId': 0, 'altSeqId': [], 'dir': 'fprop', 'mod': ['Tensor'], 'op': ['clone'], 'tid': 1431533376, 'device': 0, 'stream': 7, 'grid': (37440, 1, 1), 'block': (128, 1, 1), 'kLongName': 'void at::native::elementwise_kernel<128, 4, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1} const&)::{lambda(int)#2}>(int, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1} const&)::{lambda(int)#2})'},
]
for k in kernels:
d = Data(k)
mod = k['mod']
op = k['op']
xx = foo(mod, op, d)
d.setParams(xx.params())
def run_tests(test_name):
dummy = TestPyProfData(test_name)
test_cases = list(filter(lambda x: 'test_' in x, map(lambda x: x[0], inspect.getmembers(dummy, predicate=inspect.ismethod))))
print(f'Running tests for {test_name}')
suite = unittest.TestSuite()
for test_case in test_cases:
suite.addTest(TestPyProfData(test_case))
unittest.TextTestRunner().run(suite)
if __name__ == '__main__':
run_tests('test_data')
import test_pyprof_nvtx.TestPyProfNvtx as TestPyProfNvtx
import inspect
import os
import torch
import torch.nn.functional as F
import unittest
from apex import pyprof
pyprof.nvtx.init()
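# These are smoke tests: each torch.nn.functional call below should simply run
# under pyprof's NVTX patching; return values are never asserted on.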
# TODO: add tests for:
# F.bilinear, F.l1_loss, F.multilabel_soft_margin_loss, F.multi_margin_loss
class TestPyProfNvtx(unittest.TestCase):
def __init__(self, testName, dtype=torch.float16):
super().__init__(testName)
self.dtype = dtype
def setUp(self):
pass
def tearDown(self):
pass
def test_conv1d(self):
# Data and weight tensors
tensor1d_in_conv = torch.randn(32, 3, 224, device='cuda', dtype=self.dtype)
tensor1d_in_conv_grouped = torch.randn(32, 6, 224, device='cuda', dtype=self.dtype)
conv1d_filter = torch.randn(16, 3, 3, device='cuda', dtype=self.dtype)
conv1d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
# Vanilla conv1d
conv1d_out_vanilla = F.conv1d(tensor1d_in_conv, conv1d_filter)
# conv1d with bias
conv1d_out_with_bias = F.conv1d(tensor1d_in_conv, conv1d_filter, bias=conv1d_bias)
# conv1d - stride > 1
conv1d_out_strided = F.conv1d(tensor1d_in_conv, conv1d_filter, stride=2)
# conv1d - dilation > 1
conv1d_out_dilated = F.conv1d(tensor1d_in_conv, conv1d_filter, dilation=2)
# conv1d - groups > 1
conv1d_out_grouped = F.conv1d(tensor1d_in_conv_grouped, conv1d_filter, groups=2)
# conv1d - padding with zeros
conv1d_out_padding_zeros = F.conv1d(tensor1d_in_conv, conv1d_filter, padding=6)
def test_conv2d(self):
# Data and weight tensors
tensor2d_in_conv = torch.randn(32, 3, 224, 224, device='cuda', dtype=self.dtype)
tensor2d_in_conv_grouped = torch.randn(32, 6, 224, 224, device='cuda', dtype=self.dtype)
conv2d_filter = torch.randn(16, 3, 3, 3, device='cuda', dtype=self.dtype)
conv2d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
# Vanilla conv2d
conv2d_out_vanilla = F.conv2d(tensor2d_in_conv, conv2d_filter)
# conv2d with bias
conv2d_with_bias = F.conv2d(tensor2d_in_conv, conv2d_filter, bias=conv2d_bias)
# conv2d - stride > 1
conv2d_out_strided = F.conv2d(tensor2d_in_conv, conv2d_filter, stride=2)
# conv2d - dilation > 1
conv2d_out_dilated = F.conv2d(tensor2d_in_conv, conv2d_filter, dilation=2)
# conv2d - groups > 1
conv2d_out_grouped = F.conv2d(tensor2d_in_conv_grouped, conv2d_filter, groups=2)
# conv2d - padding with zeros
conv2d_out_padding_zeros = F.conv2d(tensor2d_in_conv, conv2d_filter, padding=6)
def test_conv3d(self):
# Data and weight tensors
tensor3d_in_conv = torch.randn(32, 3, 16, 224, 224, device='cuda', dtype=self.dtype)
tensor3d_in_conv_grouped = torch.randn(32, 6, 16, 224, 224, device='cuda', dtype=self.dtype)
conv3d_filter = torch.randn(16, 3, 3, 3, 3, device='cuda', dtype=self.dtype)
conv3d_bias = torch.ones(16, device='cuda', dtype=self.dtype)
# Vanilla conv3d
conv3d_out_vanilla = F.conv3d(tensor3d_in_conv, conv3d_filter)
# conv3d - stride > 1
conv3d_out_strided = F.conv3d(tensor3d_in_conv, conv3d_filter, stride=2)
# conv3d - dilation > 1
conv3d_out_dilated = F.conv3d(tensor3d_in_conv, conv3d_filter, dilation=2)
# conv3d - groups > 1
conv3d_out_grouped = F.conv3d(tensor3d_in_conv_grouped, conv3d_filter, groups=2)
# conv3d - padding with zeros
conv3d_out_padding_zeros = F.conv3d(tensor3d_in_conv, conv3d_filter, padding=6)
def test_conv_transpose1d(self):
# Data and weight tensors
conv_transpose1d_tensor = torch.randn(64, 16, 64, device='cuda', dtype=self.dtype)
conv_transpose1d_filter = torch.randn(16, 32, 3, device='cuda', dtype=self.dtype)
conv_transpose1d_bias = torch.randn(32, device='cuda', dtype=self.dtype)
# Conv transpose runs
conv_transpose1d_out = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter)
conv_transpose1d_out_biased = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, bias=conv_transpose1d_bias)
conv_transpose1d_out_strided = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, stride=2)
conv_transpose1d_out_padded = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, padding=3)
conv_transpose1d_out2_padded = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, output_padding=2, dilation=3)
conv_transpose1d_out_grouped = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, groups=2)
conv_transpose1d_out_dilated = F.conv_transpose1d(conv_transpose1d_tensor, conv_transpose1d_filter, dilation=2)
def test_conv_transpose2d(self):
# Data and weight tensors
conv_transpose2d_tensor = torch.randn(64, 8, 5, 5, device='cuda', dtype=self.dtype)
conv_transpose2d_filter = torch.randn(8, 16, 3, 3, device='cuda', dtype=self.dtype)
conv_transpose2d_bias = torch.randn(16, device='cuda', dtype=self.dtype)
# Conv transpose runs
conv_transpose2d_out = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter)
conv_transpose2d_out_biased = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, bias=conv_transpose2d_bias)
conv_transpose2d_out_strided = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, stride=2)
conv_transpose2d_out_padded = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, padding=3)
conv_transpose2d_out2_padded = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, output_padding=2, dilation=3)
conv_transpose2d_out_grouped = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, groups=2)
conv_transpose2d_out_dilated = F.conv_transpose2d(conv_transpose2d_tensor, conv_transpose2d_filter, dilation=2)
def test_conv_transpose3d(self):
# Data and weight tensors
conv_transpose3d_tensor = torch.randn(20, 16, 50, 10, 20, device='cuda', dtype=self.dtype)
conv_transpose3d_filter = torch.randn(16, 33, 3, 3, 3, device='cuda', dtype=self.dtype)
conv_transpose3d_bias = torch.randn(33, device='cuda', dtype=self.dtype)
# Conv transpose runs
conv_transpose3d_out = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter)
conv_transpose3d_out_biased = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, bias=conv_transpose3d_bias)
conv_transpose3d_out_strided = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, stride=2)
conv_transpose3d_out_padded = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, padding=3)
conv_transpose3d_out2_padded = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, output_padding=2, dilation=3)
conv_transpose3d_out_grouped = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, groups=2)
conv_transpose3d_out_dilated = F.conv_transpose3d(conv_transpose3d_tensor, conv_transpose3d_filter, dilation=2)
def test_unfold(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
kernel_size = (4, 5)
inp_unf_dilated = F.unfold(inp, kernel_size, dilation=2)
inp_unf_padded = F.unfold(inp, kernel_size, padding=2)
inp_unf_strided = F.unfold(inp, kernel_size, stride=2)
def test_fold(self):
inp = torch.randn(3, 20, 20, device='cuda', dtype=self.dtype)
inp_folded = F.fold(inp, (4, 5), (1, 1))
def test_avg_pool1d(self):
inp = torch.randn(1, 1, 28, device='cuda', dtype=self.dtype)
out = F.avg_pool1d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
def test_avg_pool2d(self):
inp = torch.randn(1, 3, 224, 224, device='cuda', dtype=self.dtype)
out = F.avg_pool2d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
def test_avg_pool3d(self):
inp = torch.randn(1, 3, 16, 224, 224, device='cuda', dtype=self.dtype)
out = F.avg_pool3d(inp, kernel_size=5, stride=2, padding=2, ceil_mode=True, count_include_pad=False)
def test_adaptive_avg_pool1d(self):
inp = torch.randn(1, 1, 28, device='cuda', dtype=self.dtype)
out = F.adaptive_avg_pool1d(inp, output_size=5)
def test_adaptive_avg_pool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_avg_pool2d(inp, output_size=5)
def test_adaptive_avg_pool3d(self):
inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_avg_pool3d(inp, output_size=5)
def test_max_pool1d(self):
inp = torch.randn(1, 16, 32, device='cuda', dtype=self.dtype)
out = F.max_pool1d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
def test_max_pool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.max_pool2d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
def test_max_pool3d(self):
inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.max_pool3d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
def test_adaptive_max_pool1d(self):
inp = torch.randn(1, 16, 28, device='cuda', dtype=self.dtype)
out = F.adaptive_max_pool1d(inp, output_size=5, return_indices=True)
def test_adaptive_max_pool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_max_pool2d(inp, output_size=5, return_indices=True)
def test_adaptive_max_pool3d(self):
inp = torch.randn(1, 16, 16, 32, 32, device='cuda', dtype=self.dtype)
out = F.adaptive_max_pool3d(inp, output_size=5, return_indices=True)
def test_max_unpool1d(self):
inp = torch.randn(1, 16, 32, device='cuda', dtype=self.dtype)
output, indices = F.max_pool1d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
output = F.max_unpool1d(output, indices, kernel_size=2, stride=2, padding=2)
def test_max_unpool2d(self):
inp = torch.randn(1, 16, 32, 32, device='cuda', dtype=self.dtype)
output, indices = F.max_pool2d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
output = F.max_unpool2d(output, indices, kernel_size=2, stride=2, padding=2)
def test_max_unpool3d(self):
inp = torch.randn(1, 16, 8, 32, 32, device='cuda', dtype=self.dtype)
output, indices = F.max_pool3d(inp, kernel_size=5, stride=2, padding=2, return_indices=True, ceil_mode=True)
output = F.max_unpool3d(output, indices, kernel_size=2, stride=2, padding=2)
def test_lp_pool1d(self):
inp = torch.randn(1, 32, 64, device='cuda', dtype=self.dtype)
output = F.lp_pool1d(inp, 2, 3, stride=2, ceil_mode=True)
def test_lp_pool2d(self):
#torch.nn.LPPool2d(norm_type, kernel_size, stride=None, ceil_mode=False)
inp = torch.randn(1, 32, 64, 64, device='cuda', dtype=self.dtype)
output = F.lp_pool2d(inp, 2, 3, stride=2, ceil_mode=True)
def test_threshold(self):
inp = torch.randn(1, 8, 32, 32, device='cuda', dtype=self.dtype)
output = F.threshold(inp, 6, 6, inplace=False)
def test_threshold_(self):
inp = torch.randn(1, 8, 32, 32, device='cuda', dtype=self.dtype)
output = F.threshold_(inp, 6, 6)
def test_relu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.relu(inp, inplace=False)
def test_relu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.relu_(inp)
def test_hardtanh(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.hardtanh(inp, min_val=-1., max_val=1., inplace=False)
def test_hardtanh_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.hardtanh_(inp, min_val=-1., max_val=1.)
def test_relu6(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.relu6(inp, inplace=False)
def test_elu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.elu(inp, alpha=1.0, inplace=False)
def test_elu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.elu_(inp, alpha=1.0)
def test_selu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.selu(inp)
def test_celu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.celu(inp, alpha=1.0, inplace=False)
def test_leaky_relu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.leaky_relu(inp, negative_slope=0.01, inplace=False)
def test_leaky_relu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.leaky_relu_(inp, negative_slope=0.01)
def test_prelu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
weight = torch.randn(1, device='cuda', dtype=self.dtype)
output = F.prelu(inp, weight)
def test_rrelu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.rrelu(inp, lower=1./8, upper=1./3, training=False, inplace=False)
def test_rrelu_(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.rrelu(inp, lower=1./8, upper=1./3, training=False)
def test_glu(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.glu(inp, dim=-1)
def test_logsigmoid(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.logsigmoid(inp)
def test_hardshrink(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.hardshrink(inp, lambd=0.5)
def test_tanhshrink(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.tanhshrink(inp)
def test_softsign(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.softsign(inp)
def test_softplus(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.softplus(inp, beta=1, threshold=20)
def test_softmin(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.softmin(inp, dim=1, _stacklevel=3, dtype=self.dtype)
def test_softmax(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.softmax(inp, dim=1, _stacklevel=3, dtype=self.dtype)
def test_softshrink(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.softshrink(inp, lambd=0.5)
def test_gumbel_softmax(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.gumbel_softmax(inp, tau=1, hard=False, eps=1e-10, dim=-1)
def test_log_softmax(self):
inp = torch.randn(16, 1024, device='cuda', dtype=self.dtype)
output = F.log_softmax(inp, dim=-1, _stacklevel=3)
def test_tanh(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = torch.tanh(inp)
def test_sigmoid(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = torch.sigmoid(inp)
def test_batch_norm(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
# running_mean, running_var
running_mean = torch.randn(3, device='cuda', dtype=self.dtype)
running_var = torch.randn(3, device='cuda', dtype=self.dtype)
output = F.batch_norm(inp, running_mean, running_var, weight=None, bias=None, training=False, momentum=0.1, eps=1e-05)
def test_instance_norm(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
running_mean = torch.randn(3, device='cuda', dtype=self.dtype)
running_var = torch.randn(3, device='cuda', dtype=self.dtype)
output = F.instance_norm(inp, running_mean=running_mean, running_var=running_var, weight=None, bias=None, use_input_stats=True, momentum=0.1, eps=1e-05)
def test_layer_norm(self):
inp = torch.randn(1, 3, 32, 32, device='cuda', dtype=self.dtype)
output = F.layer_norm(inp, inp.size()[1:], weight=None, bias=None, eps=1e-05)
def test_local_response_norm(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.local_response_norm(inp, 2, alpha=0.0001, beta=0.75, k=1.0)
def test_normalize(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.normalize(inp, p=2, dim=1, eps=1e-12, out=None)
def test_linear(self):
inp = torch.randn(32, 64, 128, device='cuda', dtype=self.dtype)
weight = torch.randn(256, 128, device='cuda', dtype=self.dtype)
output = F.linear(inp, weight, bias=None)
def test_dropout(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.dropout(inp, p=0.5, training=True, inplace=False)
def test_alpha_dropout(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.alpha_dropout(inp, p=0.5, training=True, inplace=False)
def test_dropout2d(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.dropout2d(inp, p=0.5, training=True, inplace=False)
def test_dropout3d(self):
inp = torch.randn(16, 8, 32, 64, 64, device='cuda', dtype=self.dtype)
output = F.dropout3d(inp, p=0.5, training=True, inplace=False)
def test_embedding(self):
pre_embed_dim = 1024
post_embed_dim = 32
inp = torch.randint(0, pre_embed_dim, (128, 16), device='cuda')
weight = torch.randn(pre_embed_dim, post_embed_dim, device='cuda', dtype=self.dtype)
output = F.embedding(inp, weight, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False)
def test_embedding_bag(self):
pre_embed_dim = 1024
post_embed_dim = 32
inp = torch.randint(0, pre_embed_dim, (128, 16), device='cuda')
weight = torch.randn(pre_embed_dim, post_embed_dim, device='cuda', dtype=self.dtype)
output = F.embedding_bag(inp, weight, offsets=None, max_norm=None, norm_type=2,
scale_grad_by_freq=False, mode='mean', sparse=False)
def test_one_hot(self):
num_classes = 10
inp = torch.randint(0, num_classes, (128, 16), device='cuda')
output = F.one_hot(inp, num_classes=10)
def test_pairwise_distance(self):
inp1 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
inp2 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
output = F.pairwise_distance(inp1, inp2, p=2.0, eps=1e-06, keepdim=False)
def test_cosine_similarity(self):
inp1 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
inp2 = torch.randn(1024, 128, device='cuda', dtype=self.dtype)
output = F.cosine_similarity(inp1, inp2, dim=1, eps=1e-8)
def test_pdist(self):
# pdist is not implemented for fp16
inp = torch.randn(128, 128, device='cuda', dtype=torch.float32)
output = F.pdist(inp, p=2)
def test_binary_cross_entropy(self):
# binary_cross_entropy is not implemented for fp16
inp = torch.randn(32, 128, device='cuda', dtype=torch.float32, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=torch.float32, requires_grad=False)
output = F.binary_cross_entropy(torch.sigmoid(inp), target)
def test_binary_cross_entropy_with_logits(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.empty_like(inp).random_(2)
output = F.binary_cross_entropy_with_logits(inp, target)
def test_poisson_nll_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.poisson_nll_loss(inp, target, log_input=True, full=False,
size_average=None, eps=1e-08, reduce=None, reduction='mean')
def test_cosine_embedding_loss(self):
inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.cosine_embedding_loss(inp1, inp2, target, margin=0,
size_average=None, reduce=None, reduction='mean')
def test_cross_entropy(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randint(0, 100, (32,), device='cuda', dtype=torch.long, requires_grad=False)
output = F.cross_entropy(inp, target, weight=None, size_average=None,
ignore_index=-100, reduce=None, reduction='mean')
def test_ctc_loss(self):
        # force fp32 because _th_normal_ (used by the next line) is not supported for fp16
log_probs = torch.randn(50, 16, 20, device='cuda', dtype=torch.float32).log_softmax(2).detach().requires_grad_()
targets = torch.randint(1, 20, (16, 30), device='cuda', dtype=torch.long)
input_lengths = torch.full((16,), 50, dtype=torch.long)
target_lengths = torch.randint(10, 30, (16,), dtype=torch.long)
loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths)
def test_hinge_embedding_loss(self):
inp = torch.randn(128, 32, device='cuda', dtype=self.dtype)
target = torch.randint(0, 1, (32,), device='cuda') - 1
output = F.hinge_embedding_loss(inp, target, margin=1.0, size_average=None, reduce=None, reduction='mean')
def test_kl_div(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
output = F.kl_div(inp, target, size_average=None, reduce=None, reduction='batchmean')
def test_mse_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
output = F.mse_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_margin_ranking_loss(self):
inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = (torch.randint(0, 1, (128,), device='cuda') - 1).type_as(inp1)
output = F.margin_ranking_loss(inp1, inp2, target, margin=0, size_average=None, reduce=None, reduction='mean')
def test_multilabel_margin_loss(self):
inp = torch.randn(1024, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randint(0, 10, (1024,), dtype=torch.long, device='cuda')
output = F.multilabel_margin_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_nll_loss(self):
inp = torch.randn(64, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randint(0, 10, (64,), device='cuda', dtype=torch.long)
output = F.nll_loss(inp, target, weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='mean')
def test_smooth_l1_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.smooth_l1_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_soft_margin_loss(self):
inp = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
target = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=False)
output = F.soft_margin_loss(inp, target, size_average=None, reduce=None, reduction='mean')
def test_triplet_margin_loss(self):
inp1 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp2 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
inp3 = torch.randn(32, 128, device='cuda', dtype=self.dtype, requires_grad=True)
output = F.triplet_margin_loss(inp1, inp2, inp3, margin=1.0, p=2,
eps=1e-06, swap=False, size_average=None, reduce=None, reduction='mean')
def test_pixel_shuffle(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = torch.nn.functional.pixel_shuffle(inp, 2)
def test_pad(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
pad = (3, 3)
output = F.pad(inp, pad, mode='constant', value=0)
def test_interpolate(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
output = F.interpolate(inp, size=None, scale_factor=2, mode='nearest', align_corners=None)
def test_grid_sample(self):
inp = torch.randn(16, 8, 64, 64, device='cuda', dtype=self.dtype)
grid = torch.randn(16, 32, 32, 2, device='cuda', dtype=self.dtype)
output = F.grid_sample(inp, grid, mode='bilinear', padding_mode='zeros')
def test_affine_grid(self):
theta = torch.randn(32, 2, 3, device='cuda', dtype=self.dtype)
size = (32, 8, 32, 32)
output = F.affine_grid(theta, size)
def run_tests(precision):
dummy = TestPyProfNvtx('test_affine_grid', None)
test_cases = list(filter(lambda x: 'test_' in x, map(lambda x: x[0], inspect.getmembers(dummy, predicate=inspect.ismethod))))
print("Running tests for {}".format(precision))
suite = unittest.TestSuite()
for test_case in test_cases:
suite.addTest(TestPyProfNvtx(test_case, precision))
unittest.TextTestRunner().run(suite)
if __name__ == '__main__':
run_tests(torch.float32)
run_tests(torch.float16)
#!/bin/bash
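# Run the L0 suite twice: first with only the ROCm-specific handling enabled,
# then additionally skipping tests marked flaky via APEX_SKIP_FLAKY_TEST.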
APEX_TEST_WITH_ROCM=1 python run_test.py
APEX_TEST_WITH_ROCM=1 APEX_SKIP_FLAKY_TEST=1 python run_test.py
"""L0 Tests Runner.
How to run this script?
1. Run all the tests: `python /path/to/apex/tests/L0/run_test.py`
2. Run one of the tests (e.g. fused layer norm):
`python /path/to/apex/tests/L0/run_test.py --include run_fused_layer_norm`
3. Run two or more of the tests (e.g. optimizers and fused layer norm):
`python /path/to/apex/tests/L0/run_test.py --include run_optimizers run_fused_layer_norm`
"""
import argparse
import os
import unittest
import sys
from apex.testing.common_utils import TEST_WITH_ROCM
from apex.testing.common_utils import SKIP_FLAKY_TEST
test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp"]
ROCM_BLACKLIST = [
'run_pyprof_nvtx',
'run_pyprof_data',
TEST_ROOT = os.path.dirname(os.path.abspath(__file__))
TEST_DIRS = [
"run_amp",
"run_fp16util",
"run_optimizers",
"run_fused_layer_norm",
"run_mlp",
"run_transformer", # not fully supported on ROCm
]
DEFAULT_TEST_DIRS = [
"run_amp",
"run_fp16util",
"run_optimizers",
"run_fused_layer_norm",
"run_mlp",
]
def parse_args():
parser = argparse.ArgumentParser(
description="L0 test runner",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--include",
nargs="+",
choices=TEST_DIRS,
default=DEFAULT_TEST_DIRS,
help="select a set of tests to run (defaults to ALL tests).",
)
args, _ = parser.parse_known_args()
return args
runner = unittest.TextTestRunner(verbosity=2)
errcode = 0
def main(args):
runner = unittest.TextTestRunner(verbosity=2)
errcode = 0
for test_dir in args.include:
test_dir = os.path.join(TEST_ROOT, test_dir)
print(test_dir)
suite = unittest.TestLoader().discover(test_dir)
for test_dir in test_dirs:
if (test_dir in ROCM_BLACKLIST) and TEST_WITH_ROCM:
continue
suite = unittest.TestLoader().discover(test_dir)
print("\nExecuting tests from " + test_dir)
result = runner.run(suite)
if not result.wasSuccessful():
errcode = 1
print("\nExecuting tests from " + test_dir)
sys.exit(errcode)
result = runner.run(suite)
if not result.wasSuccessful():
errcode = 1
if __name__ == '__main__':
args = parse_args()
main(args)
sys.exit(errcode)
import subprocess
import os
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
def run_gpt(cmd):
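    # Launch the GPT test as a subprocess and scrape its stdout for the average
    # iteration time, the parameter count, and the success message.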
args = list(cmd.split(" "))
p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
outs, errs = p.communicate()
outs = list(str((outs).decode("utf-8")).splitlines())
success = False
runtime = 0
num_params = 0
for out in outs:
out = str(out)
if "Average Iteration Time:" in str(out):
slicey = out[out.find(":") + 2 :]
try:
runtime = float(slicey)
except:
print(slicey)
quit()
if "Number of Parameters:" in str(out):
slicey = out[out.find(":") + 2 :]
try:
num_params = int(slicey)
except:
print(slicey)
quit()
if str(out) == str(TEST_SUCCESS_MESSAGE):
success = True
return runtime, round(float(int(num_params)) / 10.0 ** 9, 3), success, errs
def plot(runtimes):
import matplotlib.pyplot as plt
for distributed_setting in runtimes.keys():
plt.scatter(
runtimes[distributed_setting].keys(),
runtimes[distributed_setting].values(),
label=distributed_setting,
)
plt.legend()
plt.xlabel("Parameters (Billions)")
plt.ylabel("Training Iteration time (s)")
plt.title(str("GPT Scaling w/ Offloading"))
plt.savefig("offload_gpt_scaling.png")
plt.close()
if not os.path.exists("/my_workspace/"):
os.system("mkdir /my_workspace/")
os.system("cp *.png /my_workspace/")
def main():
runtimes = {}
nlist = (
list(range(2000, 10000, 2000))
+ list(range(10000, 50000, 5000))
+ list(range(50000, 100000, 10000))
)
print("N-List:", nlist)
for data_parr, tens_parr, pipe_parr in [(8, 1, 1), (4, 2, 1), (2, 1, 4), (1, 2, 4)]:
for offload in [True, False]:
dist_setting = (
"ddp="
+ str(data_parr)
+ ", tensor_parr="
+ str(tens_parr)
+ ", pipe_parr="
+ str(pipe_parr)
+ ", offload="
+ str(offload)
)
runtimes[dist_setting] = {}
print("Beginning Testing for", dist_setting)
for n in nlist:
cmd = "python3 -m torch.distributed.launch --nproc_per_node=8 run_gpt_minimal_test.py"
cmd += (
" --micro-batch-size 1 --num-layers "
+ str(n)
+ " --hidden-size 128 --num-attention-heads 16"
)
cmd += (
" --max-position-embeddings 128 --seq-length 128 --tensor-model-parallel-size "
+ str(tens_parr)
)
cmd += (
" --pipeline-model-parallel-size "
+ str(pipe_parr)
+ (" --cpu-offload" if offload else "")
)
print(cmd)
runtime, bill_params, success, errs = run_gpt(cmd)
if success:
runtimes[dist_setting][bill_params] = runtime
print(
str(runtime) + "s per training iter for",
str(bill_params) + "B parameter GPT-2",
)
if n >= 10000:
plot(runtimes)
else:
print("GPT-2 w/", n, "layers failed using", dist_setting)
print("Moving on to the next distributed setting...")
print("#" * (25))
print()
plot(runtimes)
break
print(runtimes)
plot(runtimes)
if __name__ == "__main__":
main()
import random
import torch
try:
import torch_ucc
except ImportError:
HAS_TORCH_UCC = False
else:
HAS_TORCH_UCC = True
print("Use UCC as backend of Pipeline Parallel ProcessGroups")
from apex.transformer.enums import ModelType
from apex.transformer import tensor_parallel
from apex.transformer import parallel_state
from apex.transformer.log_util import set_logging_level
from apex.transformer.tensor_parallel import vocab_parallel_cross_entropy
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
from apex.transformer.pipeline_parallel.utils import unwrap_model
from apex.transformer.pipeline_parallel.schedules import get_forward_backward_func
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.common import _get_params_for_weight_decay_optimization
from apex.transformer.testing.standalone_bert import bert_model_provider
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator
import warnings
class DebugWarning(Warning):
pass
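# Warning category used to surface pipeline-parallel test failures without
# aborting the run; see the failure handling at the bottom of this script.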
set_logging_level("WARNING")
mode = None
MANUAL_SEED = 42
inds = None
@@ -26,92 +48,130 @@ EASY_MODE = False
EASY_MODE_SIZ = 32
ONCE = False
# build a corpus: a public-domain download is stubbed out below; an inline sentence, tiled 1024 times, is used instead
def download_fancy_data():
# import requests
# response = requests.get('https://internet.com/book.txt')
# text = ' '.join(response.text.split())
text = """
An original sentence not subject to any license restrictions, copyright, or royalty payments. Nothing to see here. Commercial or non-commercial use. Research or non-research purposes. The quick brown fox jumps over the lazy dog. Lorem ipsum.
"""
text = text * 1024
encoded = text.encode("ascii", "replace")
ints = [int(encoded[i]) for i in range(len(encoded))]
return torch.tensor(ints)
# build a batch given sequence_len and batch size
def generate_fancy_data_labels(sequence_len, batch_size):
global data_idx
global inds
global masks
global MANUAL_SEED
temps = []
for i in range(batch_size):
if inds is None or data_idx >= len(inds):
# hack as use of RNG will fall out of sync due to pipelines being different
torch.manual_seed(MANUAL_SEED)
inds = torch.randperm(effective_length, device="cuda")
masks = (
torch.rand(
len(inds) // batch_size + 1, batch_size, sequence_len, device="cuda"
)
>= MASK_PROB
).long()
MANUAL_SEED += 1
print("new epoch", len(inds))
data_idx = 0
print("my start", inds[0:5])
print("masks_checksum:", torch.sum(masks))
if EASY_MODE:
data_idx_ = data_idx % EASY_MODE_SIZ
else:
data_idx_ = data_idx
offset = inds[data_idx_] # * SEQUENCE_LEN
data_idx += 1
curr = fancy_data[offset : offset + sequence_len].clone().detach()
temps.append(curr)
temp = torch.stack(temps, dim=0).cuda()
mask = masks[data_idx // batch_size]
mask_not = torch.logical_not(mask).long()
data = mask * temp + mask_not * 124
label = temp
if parallel_state.get_tensor_model_parallel_rank() == 0:
data_dict = {"text": data, "label": label, "mask_not": mask_not}
else:
data_dict = None
keys = ["text", "label", "mask_not"]
dtype = torch.int64
broadcasted_data = tensor_parallel.broadcast_data(keys, data_dict, torch.long)
return (
broadcasted_data["text"].long(),
broadcasted_data["label"].long(),
broadcasted_data["mask_not"],
)
easy_data = None
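# Forward-step callback for the pipeline schedule: runs the BERT model on a batch
# and returns its output plus a closure that computes the masked-LM loss.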
def fwd_step_func(batch, model):
data, label, loss_mask = batch
data = data.cuda()
label = label.cuda()
loss_mask = loss_mask.cuda()
y = model(data, torch.ones_like(data), lm_labels=label)
def loss_func(output_tensor):
global ONCE
output_tensor, _ = output_tensor
lm_loss_ = output_tensor.float()
lm_loss = torch.sum(lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()
averaged_loss = average_losses_across_data_parallel_group([lm_loss])
if data_idx >= 1536:
assert averaged_loss < 4.8
if not ONCE:
print("LOSS OK")
ONCE = True
return lm_loss, {"avg": averaged_loss}
return y, loss_func
def train(
model, optim, virtual_pipeline_model_parallel_size, pipeline_model_parallel_size, async_comm
):
sequence_len = global_vars.get_args().seq_length
micro_batch_size = global_vars.get_args().micro_batch_size
hidden_size = global_vars.get_args().hidden_size
forward_backward_func = get_forward_backward_func(
virtual_pipeline_model_parallel_size, pipeline_model_parallel_size
)
tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
for _ in range(16):
batch = generate_fancy_data_labels(sequence_len, batch_size)
optim.zero_grad()
forward_backward_func(
fwd_step_func,
batch,
model,
forward_only=False,
tensor_shape=tensor_shape,
async_comm=async_comm,
sequence_parallel_enabled=global_vars.get_args().sequence_parallel,
)
# All-reduce layernorm parameters across model parallel nodes
# when sequence parallelism is used
if parallel_state.get_tensor_model_parallel_world_size() > 1 and global_vars.get_args().sequence_parallel:
for model_module in model:
unwrapped_model = unwrap_model(model_module)
for param in unwrapped_model.parameters():
if getattr(param, 'sequence_parallel_enabled', False):
grad = param.grad
torch.distributed.all_reduce(grad, group=parallel_state.get_tensor_model_parallel_group())
optim.step()
if __name__ == "__main__":
global fancy_data
global effective_length
@@ -121,50 +181,80 @@ if __name__ == "__main__":
effective_length = fancy_data.size(0) - global_vars.get_args().seq_length
initialize_distributed("nccl")
world_size = torch.distributed.get_world_size()
failure = None
init = True
try:
virtual_pipeline_model_parallel_sizes = (None, 2,)
if HAS_TORCH_UCC:
# Deliberately skipping test with interleaved schedule for BERT model.
# It deadlocks on hybrid UCC/NCCL backend.
virtual_pipeline_model_parallel_sizes = (None,)
for virtual_pipeline_model_parallel_size in virtual_pipeline_model_parallel_sizes:
args = global_vars.get_args()
async_comm = not args.sequence_parallel and virtual_pipeline_model_parallel_size is None
data_idx = 0
ONCE = False
if init:
init = False
args = global_vars.get_args()
args.padded_vocab_size = 128 # needed in standalone gpt
args.model_type = ModelType.encoder_or_decoder
batch_size = args.global_batch_size
micro_batch_size = args.micro_batch_size
setup_microbatch_calculator(
args.rank,
args.rampup_batch_size,
args.global_batch_size,
args.micro_batch_size,
args.data_parallel_size,
)
else:
parallel_state.destroy_model_parallel()
parallel_state.initialize_model_parallel(
args.tensor_model_parallel_size,
args.pipeline_model_parallel_size,
virtual_pipeline_model_parallel_size,
default_backend="nccl",
p2p_backend="ucc" if HAS_TORCH_UCC else "nccl",
)
pipeline_model_parallel_size = (
parallel_state.get_pipeline_model_parallel_world_size()
)
tensor_parallel.random.model_parallel_cuda_manual_seed(0)
model = build_model(
bert_model_provider,
wrap_with_ddp=parallel_state.get_data_parallel_world_size() > 1,
virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
cpu_offload=args.cpu_offload,
)
assert isinstance(model, list)
assert len(model) == (
1
if virtual_pipeline_model_parallel_size is None
else virtual_pipeline_model_parallel_size
)
_param_groups = _get_params_for_weight_decay_optimization(model)
optim = torch.optim.Adam(_param_groups)
print(effective_length)
print(fancy_data.size(0))
train(
model,
optim,
virtual_pipeline_model_parallel_size,
args.pipeline_model_parallel_size,
async_comm,
)
except Exception as e:
failure = str(e)
finally:
parallel_state.destroy_model_parallel()
if failure is not None:
warnings.warn(
f"Minimal BERT Pipeline Parallel Failed with: {failure}", DebugWarning
)
print(f"Minimal BERT Pipeline Parallel Failed with: {failure}")
torch.distributed.barrier()
print(TEST_SUCCESS_MESSAGE)
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn.functional as F
from apex.transformer import parallel_state
from apex.transformer import tensor_parallel
from apex.transformer.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import set_random_seed
from apex.transformer.testing.commons import IdentityLayer
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
global_vars.set_global_variables()
def torch_cross_entropy(batch_size, seq_length, vocab_size,
logits_scale, seed):
set_random_seed(seed)
identity = IdentityLayer((batch_size, seq_length, vocab_size),
scale=logits_scale).cuda()
logits = identity()
target = torch.cuda.LongTensor(
size=(batch_size, seq_length)).random_(0, vocab_size)
loss = F.cross_entropy(logits.view(-1, logits.size()[-1]),
target.view(-1),
reduction='none').view_as(target).mean()
loss.backward()
return loss, identity.weight.grad
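# Same loss as above, but with the vocab dimension of the logits scattered across
# tensor-model-parallel ranks and computed via vocab_parallel_cross_entropy.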
def tensor_sharded_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed):
set_random_seed(seed)
identity = IdentityLayer((batch_size, seq_length, vocab_size), scale=logits_scale).cuda()
logits = identity()
logits_parallel = tensor_parallel.scatter_to_tensor_model_parallel_region(logits)
target = torch.cuda.LongTensor(
size=(batch_size, seq_length)).random_(0, vocab_size)
logits_parallel_ = logits_parallel.clone().detach()
loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
loss.backward()
# check for mutation
assert torch.equal(logits_parallel_, logits_parallel)
return loss, identity.weight.grad
def test_cross_entropy(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing cross entropy with model parallel size {} ...'.
format(tensor_model_parallel_size))
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
batch_size = 13
seq_length = 17
vocab_size_per_partition = 11
logits_scale = 1000.0
vocab_size = vocab_size_per_partition * tensor_model_parallel_size
seed = 1234
loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed)
loss_mpu, grad_mpu = tensor_sharded_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed)
error = loss_torch.sub_(loss_mpu).abs().max()
print(' max error in loss on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
error = grad_torch.sub_(grad_mpu).abs().max()
print(' max error in grad on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
if __name__ == '__main__':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test cross entropy')
test_cross_entropy(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import operator
import torch
from apex.transformer import parallel_state
from apex.transformer.tensor_parallel import data as data_utils
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
global_vars.set_global_variables()
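# Only tensor-model-parallel rank 0 keeps the original `data` dict; broadcast_data
# must reproduce identical tensors on every other rank in the group.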
def test_broadcast_data(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing broadcast_data with model parallel size {} ...'.
format(tensor_model_parallel_size))
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
torch.manual_seed(1234 + parallel_state.get_data_parallel_rank())
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
key_size_t = {
'key1': [7, 11],
'key2': [8, 2, 1],
'key3': [13],
'key4': [5, 1, 2],
'key5': [5, 12],
}
keys = list(key_size_t.keys())
data = {}
data_t = {}
for key in key_size_t:
data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
data_t[key] = data[key].clone()
data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
data_t['keyX'] = data['keyX'].clone()
if parallel_state.get_tensor_model_parallel_rank() != 0:
data = None
data_utils._check_data_types(keys, data_t, torch.int64)
key_size, key_numel, \
total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
for key in keys:
assert key_size[key] == key_size_t[key]
total_numel_t = 0
for key in keys:
target_size = functools.reduce(operator.mul, key_size_t[key], 1)
assert key_numel[key] == target_size
total_numel_t += target_size
assert total_numel == total_numel_t
data_b = data_utils.broadcast_data(keys, data, torch.int64)
for key in keys:
tensor = data_t[key].cuda()
assert data_b[key].sub(tensor).abs().max() == 0
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
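# Conceptual sketch (standalone; not called by this test): broadcast_data effectively
# packs all values into one contiguous buffer on the tensor-parallel source rank,
# broadcasts that single buffer, and unpacks it back into per-key tensors on every
# rank. A rough single-process illustration of the pack/unpack step, with hypothetical
# names:
def _pack_unpack_sketch(tensors):
    shapes = [t.shape for t in tensors]
    flat = torch.cat([t.reshape(-1) for t in tensors])  # pack into one flat buffer
    out, offset = [], 0
    for shape in shapes:  # unpack back into per-tensor views
        numel = functools.reduce(operator.mul, shape, 1)
        out.append(flat[offset:offset + numel].view(shape))
        offset += numel
    return out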
if __name__ == '__main__':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test broadcast data')
test_broadcast_data(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
......@@ -11,7 +11,6 @@ from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import (
_forward_backward_pipelining_with_interleaving,
)
from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import update_num_microbatches
......@@ -19,6 +18,7 @@ from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import fwd_step_func
from apex.transformer.log_util import get_transformer_logger, set_logging_level
from apex.transformer.testing.commons import model_provider_func
from apex.transformer._data import MegatronPretrainingRandomSampler
......@@ -31,7 +31,7 @@ _logger = get_transformer_logger("pipeline_parallel_test")
# note(mkozuki): To see if local batch size increases, uncomment the line below
# _logger.setLevel("INFO")
global_vars.set_global_variables(
args_defaults={"global_batch_size": 512, "rampup_batch_size": [32, 32, 1000],},
args_defaults={"global_batch_size": 512, "rampup_batch_size": [64, 64, 1000],},
ignore_unknown_args=True,
)
......@@ -44,29 +44,13 @@ HIDDEN_SIZE = 16
def Dataset(num_samples: int) -> List[Tuple[torch.Tensor, torch.Tensor]]:
return [(torch.randn(HIDDEN_SIZE), torch.randn(HIDDEN_SIZE // 2)) for _ in range(num_samples)]
def process_batch(batch):
if isinstance(batch, (list, tuple)):
x = batch[0]
else:
x = batch
return x
def fwd_step_func(micro_batch, model):
x = process_batch(micro_batch)
y = model(x)
# note (mkozuki): I don't think this function is nice but I do think this is enough for now
# just to check the sanity of ported pipeline functions.
def loss_func(x):
loss = torch.sum(x)
averaged_loss = average_losses_across_data_parallel_group([loss])
return loss, {"avg": averaged_loss}
return y, loss_func
return [
(
torch.randn(HIDDEN_SIZE, HIDDEN_SIZE),
torch.randn(HIDDEN_SIZE // 2, HIDDEN_SIZE // 2),
)
for _ in range(num_samples)
]
# Run forward & backward with dynamic batch size.
......@@ -88,9 +72,13 @@ def run_interleaved_with_dynamic_batch_size(
parallel_state.initialize_model_parallel(
1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size
)
pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()
pipeline_model_parallel_size = (
parallel_state.get_pipeline_model_parallel_world_size()
)
print_separator(f"BatchSamplerCls: {BatchSamplerCls.__name__}, forward_only: {forward_only}")
print_separator(
f"BatchSamplerCls: {BatchSamplerCls.__name__}, forward_only: {forward_only}"
)
model = build_model(
model_provider_func,
......@@ -122,7 +110,7 @@ def run_interleaved_with_dynamic_batch_size(
assert isinstance(batch, (list, tuple))
return [get_num_samples(b) for b in batch]
tensor_shape = [micro_batch_size, HIDDEN_SIZE]
tensor_shape = [micro_batch_size, HIDDEN_SIZE, HIDDEN_SIZE]
consumed_samples = 0
for i in range(NUM_ITERATIONS):
update_num_microbatches(consumed_samples, consistency_check=False)
......@@ -180,7 +168,10 @@ if __name__ == "__main__":
args.micro_batch_size,
1, # args.data_parallel_size,
)
for BatchSamplerCls in (MegatronPretrainingSampler, MegatronPretrainingRandomSampler):
for BatchSamplerCls in (
MegatronPretrainingSampler,
MegatronPretrainingRandomSampler,
):
for forward_only in (False, True):
n_tests += 1
pipeline_model_parallel_size = world_size
......
from functools import partial
from typing import List
import time
import torch
try:
import torch_ucc
except ImportError:
HAS_TORCH_UCC = False
else:
HAS_TORCH_UCC = True
print("Use UCC as backend of Pipeline Parallel ProcessGroups")
from apex.transformer import parallel_state
from apex.transformer.enums import ModelType
from apex.transformer.tensor_parallel import model_parallel_cuda_manual_seed
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import unwrap_model
from apex.transformer.pipeline_parallel.utils import (
average_losses_across_data_parallel_group,
)
from apex.transformer.pipeline_parallel.utils import get_ltor_masks_and_position_ids
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.common import (
_get_params_for_weight_decay_optimization,
)
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_without_interleaving import (
forward_backward_pipelining_without_interleaving,
)
from apex.transformer.testing.standalone_gpt import gpt_model_provider
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
MANUAL_SEED = 42
inds = None
data_idx = 0
N_VOCAB = 128
def download_fancy_data():
# import requests
# response = requests.get('https://internet.com/book.txt')
# text = ' '.join(response.text.split())
text = """
An original sentence not subject to any license restrictions, copyright, or royalty payments. Nothing to see here. Commercial or non-commercial use. Research or non-research purposes. The quick brown fox jumps over the lazy dog. Lorem ipsum.
"""
text = text * 1024
encoded = text.encode("ascii", "replace")
ints = [int(encoded[i]) for i in range(len(encoded))]
return torch.tensor(ints)
# build a batch given sequence_len and batch size
def generate_fancy_data_labels(sequence_len, batch_size):
global data_idx
global inds
global MANUAL_SEED
temps = list()
for i in range(batch_size):
if inds is None or data_idx >= len(inds):
# hack: reseed here, otherwise RNG use falls out of sync because pipeline stages differ
model_parallel_cuda_manual_seed(MANUAL_SEED)
inds = torch.randperm(effective_length, device="cuda")
MANUAL_SEED += 1
data_idx = 0
data_idx_ = data_idx
offset = inds[data_idx_]
data_idx += 1
curr = fancy_data[offset : offset + sequence_len + 1].clone().detach()
temps.append(curr)
temp = torch.stack(temps, dim=0).cuda()
return temp
easy_data = None
def get_batch(int_tensors: List[torch.Tensor]):
data = int_tensors[0]
# Unpack.
tokens_ = data.long()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
# Get the masks and position ids.
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
N_VOCAB, # tokenizer.eod,
False, # args.reset_position_ids,
False, # args.reset_attention_mask,
False, # args.eod_mask_loss,
)
return tokens, labels, loss_mask, attention_mask, position_ids
# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L75
def loss_func(loss_mask, output_tensor):
losses = output_tensor.float()
loss_mask = loss_mask.view(-1).float()
loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
# Reduce loss for logging.
averaged_loss = average_losses_across_data_parallel_group([loss])
return loss, {"lm loss": averaged_loss[0]}
# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L86
def fwd_step_func(batch, model):
"""Forward step."""
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(batch)
output_tensor = model(tokens, position_ids, attention_mask, labels=labels)
return output_tensor, partial(loss_func, loss_mask)
def train(model, optim, pipeline_model_parallel_size, async_comm):
sequence_len = global_vars.get_args().seq_length
micro_batch_size = global_vars.get_args().micro_batch_size
hidden_size = global_vars.get_args().hidden_size
fwd_bwd_func = forward_backward_pipelining_without_interleaving
tensor_shape = (sequence_len, micro_batch_size, hidden_size)
runtime = 0
# training loop
for i in range(3):
since = time.time()
if torch.distributed.get_rank() == 0:
print("begin iter", i)
batch = [
generate_fancy_data_labels(args.seq_length, args.global_batch_size)
for _ in range(pipeline_model_parallel_size)
]
if torch.distributed.get_rank() == 0:
print("finished making batch...")
optim.zero_grad()
fwd_bwd_func(
fwd_step_func,
batch,
model,
forward_only=False,
tensor_shape=tensor_shape,
async_comm=async_comm,
sequence_parallel_enabled=args.sequence_parallel,
)
if torch.distributed.get_rank() == 0:
print("finished forward step")
# All-reduce layernorm parameters across model parallel nodes
# when sequence parallelism is used
if parallel_state.get_tensor_model_parallel_world_size() > 1 and global_vars.get_args().sequence_parallel:
for model_module in model:
unwrapped_model = unwrap_model(model_module)
for param in unwrapped_model.parameters():
if getattr(param, 'sequence_parallel_enabled', False):
grad = param.grad
torch.distributed.all_reduce(grad, group=parallel_state.get_tensor_model_parallel_group())
optim.step()
if torch.distributed.get_rank() == 0:
print("finished iter", i)
runtime += time.time() - since
return runtime / 3.0
if __name__ == "__main__":
init = True
global_vars.set_global_variables()
for async_comm in (False,) if global_vars.get_args().sequence_parallel else (False, True):
global fancy_data
global effective_length
if init:
init = False
fancy_data = download_fancy_data()
args = global_vars.get_args()
args.model_type = ModelType.encoder_or_decoder
effective_length = fancy_data.size(0) - args.seq_length
initialize_distributed("nccl")
world_size = torch.distributed.get_world_size()
failure = None
args.padded_vocab_size = 128
batch_size = args.global_batch_size
micro_batch_size = args.micro_batch_size
setup_microbatch_calculator(
args.rank,
args.rampup_batch_size,
args.global_batch_size,
args.micro_batch_size,
args.data_parallel_size,
)
world_size = torch.distributed.get_world_size()
print(args.tensor_model_parallel_size, "MODEL PARALLEL SIZE")
parallel_state.initialize_model_parallel(
tensor_model_parallel_size_=args.tensor_model_parallel_size,
pipeline_model_parallel_size_=args.pipeline_model_parallel_size,
default_backend="nccl",
p2p_backend="ucc" if HAS_TORCH_UCC else "nccl",
)
pipeline_model_parallel_size = (
parallel_state.get_pipeline_model_parallel_world_size()
)
model_parallel_cuda_manual_seed(0)
model = build_model(
gpt_model_provider,
wrap_with_ddp=parallel_state.get_data_parallel_world_size() > 1,
virtual_pipeline_model_parallel_size=None,
cpu_offload=args.cpu_offload,
)
assert isinstance(model, list), model
_param_groups = _get_params_for_weight_decay_optimization(model)
optim = torch.optim.Adam(_param_groups)
runtime = train(model, optim, args.pipeline_model_parallel_size, async_comm)
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
print("Average Iteration Time:", runtime)
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from apex.transformer import parallel_state
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
global_vars.set_global_variables()
def test_initialize_model_parallel(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing initialize_model_parallel with size {} ...'.format(
tensor_model_parallel_size))
tensor_model_parallel_size_ = min(
tensor_model_parallel_size,
torch.distributed.get_world_size(),
)
assert not parallel_state.model_parallel_is_initialized()
parallel_state.initialize_model_parallel(tensor_model_parallel_size_)
assert parallel_state.model_parallel_is_initialized()
# Checks.
def check(group, world_size, rank):
assert world_size == torch.distributed.get_world_size(group=group)
assert rank == torch.distributed.get_rank(group=group)
# Model parallel.
world_size = tensor_model_parallel_size_
rank = torch.distributed.get_rank() % tensor_model_parallel_size_
assert world_size == parallel_state.get_tensor_model_parallel_world_size()
assert rank == parallel_state.get_tensor_model_parallel_rank()
check(parallel_state.get_tensor_model_parallel_group(), world_size, rank)
# Data parallel.
world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_
rank = torch.distributed.get_rank() // tensor_model_parallel_size_
assert world_size == parallel_state.get_data_parallel_world_size()
assert rank == parallel_state.get_data_parallel_rank()
check(parallel_state.get_data_parallel_group(), world_size, rank)
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
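# Rank-layout illustration (hypothetical helper mirroring the arithmetic checked above):
# for world_size=8 and tensor_model_parallel_size=2, tensor-parallel groups are
# [0, 1], [2, 3], [4, 5], [6, 7] and data-parallel groups are [0, 2, 4, 6], [1, 3, 5, 7],
# i.e. tp_rank = global_rank % tp_size and dp_rank = global_rank // tp_size.
def _group_layout_sketch(world_size, tensor_model_parallel_size):
    tp = tensor_model_parallel_size
    tp_groups = [list(range(start, start + tp)) for start in range(0, world_size, tp)]
    dp_groups = [list(range(tp_rank, world_size, tp)) for tp_rank in range(tp)]
    return tp_groups, dp_groups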
def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_):
if torch.distributed.get_rank() == 0:
print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format(
tensor_model_parallel_size_))
tensor_model_parallel_size = min(
tensor_model_parallel_size_,
torch.distributed.get_world_size(),
)
assert not parallel_state.model_parallel_is_initialized()
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
assert parallel_state.model_parallel_is_initialized()
# Checks
src_rank = torch.distributed.get_rank() - parallel_state.get_tensor_model_parallel_rank()
assert parallel_state.get_tensor_model_parallel_src_rank() == src_rank
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print('>> passed the test :-)')
if __name__ == '__main__':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test initialize model parallel')
test_initialize_model_parallel(tensor_model_parallel_size)
print_separator('test model parallel source rank')
test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn.init as init
from torch.nn.parameter import Parameter
from apex.transformer import parallel_state
from apex.transformer.tensor_parallel import layers
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import set_random_seed
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
global_vars.set_global_variables()
class IdentityLayer3D(torch.nn.Module):
def __init__(self, m, n, k):
super(IdentityLayer3D, self).__init__()
self.weight = Parameter(torch.Tensor(m, n, k))
torch.nn.init.xavier_normal_(self.weight)
def forward(self):
return self.weight
def test_parallel_embedding(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing parallel embedding with model parallel size {} ...'.
format(tensor_model_parallel_size))
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
batch_size = 17
seq_length = 23
vocab_size = 48
hidden_size = 16
seed = 1236
set_random_seed(123)
input_data = torch.LongTensor(
size=(batch_size, seq_length)).random_(0, vocab_size).cuda()
loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda()
set_random_seed(seed)
embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda()
output = embedding_original(input_data)
loss_original = torch.mul(output, loss_weight).sum()
loss_original.backward()
set_random_seed(seed)
embedding_parallel = layers.ParallelEmbedding(
vocab_size, hidden_size, init_method=init.normal_).cuda()
output = embedding_parallel(input_data)
loss_parallel = torch.mul(output, loss_weight).sum()
loss_parallel.backward()
set_random_seed(seed)
embedding_vocab_parallel = layers.VocabParallelEmbedding(
vocab_size, hidden_size, init_method=init.normal_).cuda()
output = embedding_vocab_parallel(input_data)
loss_vocab_parallel = torch.mul(output, loss_weight).sum()
loss_vocab_parallel.backward()
torch.distributed.barrier()
error = loss_parallel.sub(loss_original).abs()
print(' error in loss (parallel) on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-12, 'error: {}'.format(error)
torch.distributed.barrier()
error = loss_vocab_parallel.sub(loss_original).abs()
print(' error in loss (vocab parallel) on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-12, 'error: {}'.format(error)
weight_grad_orig = torch.split(embedding_original.weight.grad,
hidden_size // tensor_model_parallel_size,
1)[parallel_state.get_tensor_model_parallel_rank()]
error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max()
print(' error in grad (parallel) on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-12, 'error: {}'.format(error)
weight_grad_orig = torch.split(embedding_original.weight.grad,
vocab_size // tensor_model_parallel_size,
0)[parallel_state.get_tensor_model_parallel_rank()]
error = embedding_vocab_parallel.weight.grad.sub(
weight_grad_orig).abs().max()
print(' error in grad (vocab parallel) on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-12, 'error: {}'.format(error)
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print('>> passed the test :-)')
def test_initialize_affine_weight(tensor_model_parallel_size, device):
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
if torch.distributed.get_rank() == 0:
print('> testing initialize_affine_weight with model parallel '
'size: {}'.format(tensor_model_parallel_size))
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed = 12345
input_size_coeff = 13
input_size = input_size_coeff * tensor_model_parallel_size
output_size_coeff = 17
output_size = output_size_coeff * tensor_model_parallel_size
# ---------------
# Column parallel
# ---------------
weight = torch.empty(output_size_coeff, input_size)
set_random_seed(seed)
if device == 'cpu':
layers._initialize_affine_weight_cpu(weight, output_size, input_size,
output_size_coeff, 0,
torch.nn.init.normal_,
params_dtype=global_vars.get_args().params_dtype,
)
else:
layers._initialize_affine_weight_gpu(weight, torch.nn.init.normal_, 0)
# Target.
set_random_seed(seed)
master_weight = torch.empty(output_size, input_size)
torch.nn.init.normal_(master_weight)
rank = parallel_state.get_tensor_model_parallel_rank()
my_weight = torch.split(master_weight, output_size_coeff,
dim=0)[rank].contiguous().clone()
# Compare.
error = weight.sub(my_weight).abs().max()
torch.distributed.barrier()
print(' column parallel max error (should be zero) on global rank '
'{}: {}'.format(torch.distributed.get_rank(), error))
assert error < 1.0e-6
# ------------
# Row parallel
# ------------
weight = torch.empty(output_size, input_size_coeff)
set_random_seed(seed)
if device == 'cpu':
layers._initialize_affine_weight_cpu(
weight, output_size, input_size, input_size_coeff, 1, torch.nn.init.normal_,
params_dtype=global_vars.get_args().params_dtype)
else:
layers._initialize_affine_weight_gpu(weight, torch.nn.init.normal_, 1)
# Target.
set_random_seed(seed)
master_weight = torch.empty(output_size, input_size)
torch.nn.init.normal_(master_weight)
rank = parallel_state.get_tensor_model_parallel_rank()
my_weight = torch.split(master_weight, input_size_coeff,
dim=1)[rank].contiguous().clone()
# Compare.
error = weight.sub(my_weight).abs().max()
torch.distributed.barrier()
print(' row parallel max error (should be zero) on global rank '
'{}: {}'.format(torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
class IdentityLayer2D(torch.nn.Module):
def __init__(self, m, n):
super(IdentityLayer2D, self).__init__()
self.weight = Parameter(torch.Tensor(m, n))
torch.nn.init.xavier_normal_(self.weight)
def forward(self):
return self.weight
def test_column_parallel_linear(tensor_model_parallel_size):
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
if torch.distributed.get_rank() == 0:
print('> testing ColumnParallelLinear with model parallel '
'size: {}'.format(tensor_model_parallel_size))
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
input_size_coeff = 13
input_size = input_size_coeff * tensor_model_parallel_size
output_size_coeff = 17
output_size = output_size_coeff * tensor_model_parallel_size
batch_size = 7
# Network
identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
linear_layer = layers.ColumnParallelLinear(
input_size, output_size, keep_master_weight_for_test=True,
params_dtype=global_vars.get_args().params_dtype,
use_cpu_initialization=global_vars.get_args().use_cpu_initialization,
).cuda()
loss_weight = torch.randn([batch_size, output_size]).cuda()
# Forward
input_ = identity_layer()
output, _ = linear_layer(input_)
loss = torch.mul(output, loss_weight).sum()
# Backward
loss.backward()
# Values.
dLdY = loss_weight
X = identity_layer.weight
A = linear_layer.master_weight.cuda()
dLdA = torch.matmul(dLdY.t(), X)
dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
dLdX = torch.matmul(dLdY, A)
rank = parallel_state.get_tensor_model_parallel_rank()
my_dLdA = torch.split(dLdA, output_size_coeff,
dim=0)[rank].contiguous().clone()
error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdA on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
my_dLdb = torch.split(dLdb, output_size_coeff,
dim=0)[rank].contiguous().clone()
error = my_dLdb.sub(linear_layer.bias.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdb on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
error = dLdX.sub(identity_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdX on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
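# Single-device check of the gradient identities used above (illustrative only, tiny
# made-up sizes): for Y = X A^T + b, dL/dA = dL/dY^T X, dL/db is dL/dY summed over the
# batch, and dL/dX = dL/dY A, which is what the test compares shard by shard.
def _linear_grad_identity_sketch():
    X = torch.randn(3, 5, requires_grad=True)
    A = torch.randn(4, 5, requires_grad=True)
    b = torch.randn(4, requires_grad=True)
    dLdY = torch.randn(3, 4)
    Y = torch.nn.functional.linear(X, A, b)  # Y = X A^T + b
    Y.backward(dLdY)
    assert torch.allclose(A.grad, dLdY.t() @ X, atol=1e-5)
    assert torch.allclose(b.grad, dLdY.sum(dim=0), atol=1e-5)
    assert torch.allclose(X.grad, dLdY @ A, atol=1e-5)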
def test_column_parallel_linear_with_async_allreduce_autocast(tensor_model_parallel_size):
autocast_dtypes = (torch.half, torch.bfloat16) if torch.cuda.is_bf16_supported() else (torch.half,)
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
input_size_coeff = 13
input_size = input_size_coeff * tensor_model_parallel_size
output_size_coeff = 17
output_size = output_size_coeff * tensor_model_parallel_size
batch_size = 7
# Network
identity_layer = IdentityLayer3D(batch_size, batch_size, input_size).cuda()
linear_layer = layers.ColumnParallelLinear(
input_size, output_size, keep_master_weight_for_test=True,
params_dtype=global_vars.get_args().params_dtype,
use_cpu_initialization=global_vars.get_args().use_cpu_initialization,
).cuda()
assert linear_layer.async_tensor_model_parallel_allreduce or tensor_model_parallel_size == 1
# Forward
for dtype in autocast_dtypes:
loss_weight = torch.randn([batch_size, output_size]).cuda()
with torch.cuda.amp.autocast(dtype=dtype):
output, _ = linear_layer(identity_layer())
loss = torch.mul(output, loss_weight).sum()
assert output.dtype == dtype
# Backward
loss.backward()
torch.distributed.barrier()
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
def test_column_parallel_linear_with_async_allreduce_custom_amp(tensor_model_parallel_size):
dtypes = (torch.half, torch.bfloat16) if torch.cuda.is_bf16_supported() else (torch.half,)
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
input_size_coeff = 13
input_size = input_size_coeff * tensor_model_parallel_size
output_size_coeff = 17
output_size = output_size_coeff * tensor_model_parallel_size
batch_size = 7
for dtype in dtypes:
# Network
identity_layer = IdentityLayer3D(batch_size, batch_size, input_size).to(device="cuda", dtype=dtype)
linear_layer = layers.ColumnParallelLinear(
input_size, output_size, keep_master_weight_for_test=True,
params_dtype=global_vars.get_args().params_dtype,
use_cpu_initialization=global_vars.get_args().use_cpu_initialization,
).to(device="cuda", dtype=dtype)
# Forward
loss_weight = torch.randn([batch_size, output_size]).cuda()
output, _ = linear_layer(identity_layer())
loss = torch.mul(output, loss_weight).sum()
loss.backward()
torch.distributed.barrier()
assert output.dtype == dtype
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
def test_row_parallel_linear(tensor_model_parallel_size):
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
if torch.distributed.get_rank() == 0:
print('> testing RowParallelLinear with model parallel '
'size: {}'.format(tensor_model_parallel_size))
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
input_size_coeff = 13
input_size = input_size_coeff * tensor_model_parallel_size
output_size_coeff = 17
output_size = output_size_coeff * tensor_model_parallel_size
batch_size = 7
# Network
identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
linear_layer = layers.RowParallelLinear(
input_size, output_size, keep_master_weight_for_test=True,
params_dtype=global_vars.get_args().params_dtype,
use_cpu_initialization=global_vars.get_args().use_cpu_initialization,
).cuda()
loss_weight = torch.randn([batch_size, output_size]).cuda()
# Forward
input_ = identity_layer()
output, _ = linear_layer(input_)
loss = torch.mul(output, loss_weight).sum()
# Backward
loss.backward()
# Values.
dLdY = loss_weight
X = identity_layer.weight
A = linear_layer.master_weight.cuda()
dLdA = torch.matmul(dLdY.t(), X)
dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
dLdX = torch.matmul(dLdY, A)
rank = parallel_state.get_tensor_model_parallel_rank()
my_dLdA = torch.split(dLdA, input_size_coeff,
dim=1)[rank].contiguous().clone()
error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdA on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
error = dLdb.sub(linear_layer.bias.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdb on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
error = dLdX.sub(identity_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdX on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
def parallel_self_attention(tensor_model_parallel_size, num_att_heads_per_partition,
hidden_size_per_att_head, dropout_prob, batch_size,
sequence_length):
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
num_att_heads = num_att_heads_per_partition * \
torch.distributed.get_world_size()
hidden_size = hidden_size_per_att_head * num_att_heads
# Network
identity_layer = IdentityLayer3D(batch_size, sequence_length,
hidden_size).cuda()
attention_layer = parallel_state.BertParallelSelfAttention(hidden_size, num_att_heads,
dropout_prob).cuda()
loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
# Forward
input_ = identity_layer()
output = attention_layer(input_, attention_mask)
loss = torch.mul(output, loss_weight).sum()
# Backward
loss.backward()
rank = parallel_state.get_tensor_model_parallel_rank()
parallel_state.destroy_model_parallel()
return rank, hidden_size, tensor_model_parallel_size, loss, \
attention_layer, identity_layer
def test_parallel_self_attention(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing ParallelSelfAttention with model parallel '
'size: {}'.format(tensor_model_parallel_size))
num_att_heads_per_partition = 3
hidden_size_per_att_head = 7
dropout_prob = 0.0 # has to be zero
batch_size = 5
sequence_length = 13
rank_1, hidden_size_1, tensor_model_parallel_size_1, loss_1, \
attention_layer_1, identity_layer_1 = parallel_self_attention(
1, num_att_heads_per_partition,
hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
rank, hidden_size, tensor_model_parallel_size, loss, \
attention_layer, identity_layer = parallel_self_attention(
tensor_model_parallel_size, num_att_heads_per_partition,
hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
assert hidden_size_1 == hidden_size
error = loss_1.sub(loss).abs().max()
torch.distributed.barrier()
print(' loss error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-6
my_lin_grad_list = torch.split(
attention_layer_1.query_key_value.weight.grad,
hidden_size // tensor_model_parallel_size, 0)[rank::tensor_model_parallel_size]
my_lin_grad = torch.cat(my_lin_grad_list, dim=0)
error = my_lin_grad.sub(
attention_layer.query_key_value.weight.grad).abs().max()
torch.distributed.barrier()
print(' weight gradient error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-6
error = identity_layer_1.weight.grad.sub(
identity_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' input gradient error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-6
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
def parallel_transformer(tensor_model_parallel_size, num_att_heads_per_partition,
hidden_size_per_att_head, batch_size, sequence_length):
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
num_att_heads = num_att_heads_per_partition * \
torch.distributed.get_world_size()
hidden_size = hidden_size_per_att_head * num_att_heads
intermediate_size = 4 * hidden_size
# Network
identity_layer = IdentityLayer3D(batch_size, sequence_length,
hidden_size).cuda()
transformer_layer = parallel_state.BertParallelTransformerLayer(
hidden_size, intermediate_size, num_att_heads, 0.0, 0.0,
torch.nn.functional.relu, 1.0e-5).cuda()
loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
# Forward
input_ = identity_layer()
output = transformer_layer(input_, attention_mask)
loss = torch.mul(output, loss_weight).sum()
# Backward
loss.backward()
rank = parallel_state.get_tensor_model_parallel_rank()
parallel_state.destroy_model_parallel()
return rank, hidden_size, tensor_model_parallel_size, loss, \
transformer_layer, identity_layer
def test_parallel_transformer_layer(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing ParallelTransformerLayer with model parallel '
'size: {}'.format(tensor_model_parallel_size))
num_att_heads_per_partition = 3
hidden_size_per_att_head = 7
batch_size = 5
sequence_length = 13
rank_1, hidden_size_1, tensor_model_parallel_size_1, loss_1, \
transformer_layer_1, identity_layer_1 = parallel_transformer(
1, num_att_heads_per_partition,
hidden_size_per_att_head, batch_size, sequence_length)
rank, hidden_size, tensor_model_parallel_size, loss, \
transformer_layer, identity_layer = parallel_transformer(
tensor_model_parallel_size, num_att_heads_per_partition,
hidden_size_per_att_head, batch_size, sequence_length)
error = loss_1.sub(loss).abs().max()
torch.distributed.barrier()
print(' loss error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-5, 'error: {}'.format(error)
error = identity_layer_1.weight.grad.sub(
identity_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' input gradient error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-5, 'error: {}'.format(error)
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
if __name__ == '__main__':
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
exceptions = []
print_separator('test initialize affine weight cpu')
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
try:
test_initialize_affine_weight(tensor_model_parallel_size, 'cpu')
except Exception as e:
exceptions.append(f"test_initialize_affine_weight-cpu with tensor model parallel size of {tensor_model_parallel_size} failed: {str(e)}")
# Reset groups
parallel_state.destroy_model_parallel()
break
else:
tensor_model_parallel_size *= 2
# Reset groups
parallel_state.destroy_model_parallel()
print_separator('test initialize affine weight gpu')
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
try:
test_initialize_affine_weight(tensor_model_parallel_size, 'gpu')
except Exception as e:
exceptions.append(f"test_initialize_affine_weight-gpu with tensor model parallel size of {tensor_model_parallel_size} failed: {str(e)}")
# Reset groups
parallel_state.destroy_model_parallel()
break
else:
tensor_model_parallel_size *= 2
# Deleted, replaced with vocab parallel embedding?
#tensor_model_parallel_size = 1
#while tensor_model_parallel_size <= world_size:
# print_separator('test parallel embedding')
# test_parallel_embedding(tensor_model_parallel_size)
# tensor_model_parallel_size *= 2
print_separator('test column-parallel linear')
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
try:
test_column_parallel_linear(tensor_model_parallel_size)
except Exception as e:
exceptions.append(f"test_column_parallel_linear with tensor model parallel size of {tensor_model_parallel_size} failed: {str(e)}")
# Reset groups
parallel_state.destroy_model_parallel()
break
else:
tensor_model_parallel_size *= 2
print_separator('test row-parallel linear')
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
try:
test_row_parallel_linear(tensor_model_parallel_size)
except Exception as e:
exceptions.append(f"test_row_parallel_linear with tensor model parallel size of {tensor_model_parallel_size} failed: {str(e)}")
# Reset groups
parallel_state.destroy_model_parallel()
break
else:
tensor_model_parallel_size *= 2
print_separator("test ColumnParallelLinearWithAsyncAllreduce - autocast")
tensor_model_parallel_size = 2
while tensor_model_parallel_size <= world_size:
try:
test_column_parallel_linear_with_async_allreduce_autocast(tensor_model_parallel_size)
except Exception as e:
exceptions.append(f"test_column_parallel_linear_with_async_allreduce_autocast with tensor model parallel size of {tensor_model_parallel_size} failed: {str(e)}")
# Reset groups
parallel_state.destroy_model_parallel()
break
else:
tensor_model_parallel_size *= 2
print_separator("test ColumnParallelLinearWithAsyncAllreduce - custom AMP")
tensor_model_parallel_size = 2
while tensor_model_parallel_size <= world_size:
try:
test_column_parallel_linear_with_async_allreduce_custom_amp(tensor_model_parallel_size)
except Exception as e:
exceptions.append(f"test_column_parallel_linear_with_async_allreduce_custom_amp with tensor model parallel size of {tensor_model_parallel_size} failed: {str(e)}")
# Reset groups
parallel_state.destroy_model_parallel()
break
else:
tensor_model_parallel_size *= 2
if exceptions:
raise RuntimeError("\n".join(exceptions))
# Deleted
#print_separator('test parallel self-attention')
#tensor_model_parallel_size = 1
#while tensor_model_parallel_size <= world_size:
# test_parallel_self_attention(tensor_model_parallel_size)
# tensor_model_parallel_size *= 2
# Deleted because ParallelTransformerLayer no longer exists
# print_separator('test parallel transformer')
# tensor_model_parallel_size = 1
# while tensor_model_parallel_size <= world_size:
# test_parallel_transformer_layer(tensor_model_parallel_size)
# tensor_model_parallel_size *= 2
import torch
from apex.transformer import parallel_state
from apex.transformer.tensor_parallel import mappings
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import initialize_distributed
global_vars.set_global_variables()
def test__reduce(args, tensor_model_parallel_size):
print("Testing reduction size =", tensor_model_parallel_size)
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
assert torch.equal(
mappings._reduce(torch.full((10, 10, 10, 10), (50))),
torch.full((10, 10, 10, 10), 50 * tensor_model_parallel_size),
)
parallel_state.destroy_model_parallel()
print("Passed!")
def test__split(args, tensor_model_parallel_size):
print("Testing splitting size =", tensor_model_parallel_size)
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
listy = []
for i in range(tensor_model_parallel_size):
listy.append(torch.randn(10, 1))
x = torch.cat(tuple(listy), 1)
out = mappings._split(x)
assert torch.equal(out, listy[parallel_state.get_tensor_model_parallel_rank()])
parallel_state.destroy_model_parallel()
print("Passed!")
def test__gather(args, tensor_model_parallel_size):
print("Testing gathering size =", tensor_model_parallel_size)
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
assert torch.equal(
mappings._gather(torch.tensor([parallel_state.get_tensor_model_parallel_rank()])),
torch.tensor(list(range(tensor_model_parallel_size))),
)
parallel_state.destroy_model_parallel()
print("Passed!")
if __name__ == "__main__":
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
args = global_vars.get_args()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
test__reduce(args, tensor_model_parallel_size)
test__split(args, tensor_model_parallel_size)
test__gather(args, tensor_model_parallel_size)
tensor_model_parallel_size *= 2
print(">> passed the test :-)")
from functools import partial
import logging
from typing import List
import torch
from apex.transformer import parallel_state
from apex.transformer.pipeline_parallel.schedules.common import _get_params_for_weight_decay_optimization
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import _forward_backward_pipelining_with_interleaving
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_without_interleaving import forward_backward_pipelining_without_interleaving
from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
from apex.transformer.pipeline_parallel.utils import get_ltor_masks_and_position_ids
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import update_num_microbatches
from apex.transformer.tensor_parallel import model_parallel_cuda_manual_seed
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.standalone_gpt import gpt_model_provider
from apex.transformer.log_util import get_transformer_logger, set_logging_level
set_logging_level(logging.NOTSET)
_logger = get_transformer_logger("megatron_gpt_pipeline_test")
global_vars.set_global_variables()
N_VOCAB = 8192
def generate_batch(batch_size, sequence_length):
size = batch_size, sequence_length + 1
int_tensor = torch.randint(low=0, high=N_VOCAB, size=size, dtype=torch.long).cuda()
return int_tensor,
# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L44
def get_batch(int_tensors: List[torch.Tensor]):
data = int_tensors[0]
# Unpack.
tokens_ = data.long()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
# Get the masks and position ids.
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
N_VOCAB, # tokenizer.eod,
False, # args.reset_position_ids,
False, # args.reset_attention_mask,
False, # args.eod_mask_loss,
)
return tokens, labels, loss_mask, attention_mask, position_ids
# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L75
def loss_func(loss_mask, output_tensor):
losses = output_tensor.float()
loss_mask = loss_mask.view(-1).float()
loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
# Reduce loss for logging.
averaged_loss = average_losses_across_data_parallel_group([loss])
return loss, {'lm loss': averaged_loss[0]}
# Ref: https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/pretrain_gpt.py#L86
# TODO (mkozuki): Currently I'm seeing no attribute `word_embeddings` which looks weird.
def forward_step(batch, model):
"""Forward step."""
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(batch)
output_tensor = model(tokens, position_ids, attention_mask, labels=labels)
return output_tensor, partial(loss_func, loss_mask)
def run_gpt(pipeline_model_parallel_size, virtual_pipeline_model_parallel_size=None, forward_only=False):
parallel_state.initialize_model_parallel(1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size)
model_parallel_cuda_manual_seed(42)
model = build_model(
gpt_model_provider, True,
virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size)
_logger.debug("building model")
assert isinstance(model, list)
assert len(model) == (1 if virtual_pipeline_model_parallel_size is None else virtual_pipeline_model_parallel_size)
_param_groups = _get_params_for_weight_decay_optimization(model)
torch.optim.Adam(_param_groups)
if parallel_state.is_pipeline_last_stage():
_logger.debug("checking `word_embeddings` existence")
for m in model:
assert hasattr(m, "word_embeddings")
args = global_vars.get_args()
if virtual_pipeline_model_parallel_size is None:
batch = generate_batch(args.global_batch_size, args.seq_length)
else:
batch = [generate_batch(args.global_batch_size, args.seq_length) for _ in range(virtual_pipeline_model_parallel_size)]
_logger.debug("preparing batch")
if virtual_pipeline_model_parallel_size is None:
fwd_bwd_func = forward_backward_pipelining_without_interleaving
else:
fwd_bwd_func = _forward_backward_pipelining_with_interleaving
_logger.debug(f"selecting forward_backward func: {fwd_bwd_func}")
tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
_logger.debug(f"`tensor_shape`: {tensor_shape}")
fwd_bwd_func(forward_step, batch, model, forward_only=forward_only, tensor_shape=tensor_shape)
_logger.debug(TEST_SUCCESS_MESSAGE)
if __name__ == "__main__":
initialize_distributed()
args = global_vars.get_args()
args.padded_vocab_size = N_VOCAB
setup_microbatch_calculator(
args.rank,
args.rampup_batch_size,
args.global_batch_size,
args.micro_batch_size,
1, # args.data_parallel_size,
)
update_num_microbatches(0, True)
print_separator("run GPT model")
try:
run_gpt(torch.distributed.get_world_size())
# TODO(mkozuki): handle exceptions properly; for now they are swallowed here
# because this script is not exercised by CI
except Exception as e:
_logger.debug(str(e))
pass
finally:
parallel_state.destroy_model_parallel()
from typing import Optional, Union, List
import torch
import torch.nn as nn
import apex
from apex.transformer import parallel_state
from apex.transformer.pipeline_parallel import get_forward_backward_func
from apex.transformer.pipeline_parallel.schedules.common import _get_params_for_weight_decay_optimization
from apex.transformer.pipeline_parallel.schedules.common import build_model
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_no_pipelining import forward_backward_no_pipelining
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import _forward_backward_pipelining_with_interleaving
from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_without_interleaving import forward_backward_pipelining_without_interleaving
from apex.transformer.pipeline_parallel.utils import average_losses_across_data_parallel_group
from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator
from apex.transformer.pipeline_parallel.utils import update_num_microbatches
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import print_separator
from apex.transformer.log_util import get_transformer_logger, set_logging_level
# set_logging_level("INFO")
_logger = get_transformer_logger("pipeline_parallel_test")
global_vars.set_global_variables()
batch_size, micro_batch_size = None, None
hidden_size = 16
fwd_bwd_functions = {
"no_pipelining": forward_backward_no_pipelining,
"no_interleaving": forward_backward_pipelining_without_interleaving,
"interleaving": _forward_backward_pipelining_with_interleaving,
}
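# Rough distinction between the three schedules above (descriptive note):
# "no_pipelining" runs all microbatches on a single stage and simply accumulates
# gradients; "no_interleaving" splits the minibatch into microbatches and pipelines
# them across stages (1F1B style); "interleaving" additionally assigns several model
# chunks (virtual pipeline stages) to each rank to shrink the pipeline bubble.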
# note (mkozuki): `pre_process` and `post_process` are placeholders until an interleaving-schedule test is added.
class MyLayer(nn.Module):
def __init__(self, pre_process: bool, post_process: bool):
super().__init__()
self.pre_process = pre_process
self.post_process = post_process
self.layer = nn.Linear(hidden_size, hidden_size)
def forward(self, x):
return self.layer(x)
class MyModel(nn.Module):
def __init__(self, pre_process: bool = False, post_process: bool = False) -> None:
super().__init__()
self.pre_process = pre_process
self.post_process = post_process
self.layer = MyLayer(pre_process=pre_process, post_process=post_process)
self.input_tensor = None
def set_input_tensor(self, input_tensor: Union[torch.Tensor, List[torch.Tensor]]) -> None:
self.input_tensor = input_tensor
def forward(self, x: Optional[torch.Tensor]) -> torch.Tensor:
if self.input_tensor is None:
return self.layer(x)
return self.layer(self.input_tensor)
def model_provider_func(pre_process, post_process) -> MyModel:
return MyModel(pre_process, post_process)
def process_batch(batch):
if isinstance(batch, list):
x = batch[0]
else:
x = batch
return x
def fwd_step_func(batch, model):
x = process_batch(batch)
y = model(x)
# note (mkozuki): this loss function is crude, but it is enough for now to
# sanity-check the ported pipeline functions.
def loss_func(x):
loss = torch.sum(x)
averaged_loss = average_losses_across_data_parallel_group([loss])
return loss, {'avg': averaged_loss}
return y, loss_func
# TODO (mkozuki): Add a case with `autocast` and `GradScaler`.
# Run forward & backward for one minibatch.
def forward_backward_func_template(
name: str,
forward_backward_func,
pipeline_model_parallel_size: int,
forward_only: bool,
) -> None:
print_separator(f"name: {name}, pipeline model parallel size: {pipeline_model_parallel_size}")
virtual_pipeline_model_parallel_size = 2 if name == "interleaving" else None
if name == "no_pipelining":
# note (mkozuki): `forward_backward_no_pipelining` is **not** compatible with
# pipeline_model_parallel_size > 1, so initialize with tensor and pipeline
# model parallel sizes of 1 for this case.
parallel_state.initialize_model_parallel(1, 1, None)
else:
# NOTE (mkozuki): `virtual_pipeline_model_parallel_size` is necessary to enable interleaving scheduling
# In megatron, `args.virtual_pipeline_model_parallel_size` is computed in megatron/arguments.py and
# used ubiquitously, but this test uses a custom model, so it is safe to set it directly here.
parallel_state.initialize_model_parallel(
1, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size)
if virtual_pipeline_model_parallel_size is not None:
# Check the experimental warning message
get_forward_backward_func(virtual_pipeline_model_parallel_size, pipeline_model_parallel_size)
pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()
model = build_model(
model_provider_func,
wrap_with_ddp=True,
virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
)
assert isinstance(model, list)
assert len(model) == (1 if virtual_pipeline_model_parallel_size is None else virtual_pipeline_model_parallel_size)
_param_groups = _get_params_for_weight_decay_optimization(model)
torch.optim.Adam(_param_groups, lr=1e-4)
tensor_shape = [batch_size // parallel_state.get_data_parallel_world_size(), hidden_size]
batch = (torch.randn(tensor_shape).cuda(),)
tensor_shape[0] = micro_batch_size
update_num_microbatches(0)
forward_backward_func(
fwd_step_func, batch, model, forward_only=forward_only, tensor_shape=tensor_shape)
if not forward_only:
for m in model:
for p in m.parameters():
if p.grad is None:
raise RuntimeError("grad not found")
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
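# Hedged sketch for the autocast/GradScaler TODO above (hypothetical; not wired into
# this test): the forward step could run under autocast, and the optimizer step would
# then go through a GradScaler, roughly as follows.
def _fwd_step_func_autocast_sketch(batch, model):
    x = process_batch(batch)
    with torch.cuda.amp.autocast():
        y = model(x)
    def loss_func(out):
        loss = torch.sum(out)
        averaged_loss = average_losses_across_data_parallel_group([loss])
        return loss, {'avg': averaged_loss}
    return y, loss_func
def _scaler_step_sketch(optimizer, scaler, loss):
    # `scaler` is assumed to be a torch.cuda.amp.GradScaler instance
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()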
if __name__ == "__main__":
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
n_tests = 0
failures = []
initialize_distributed()
world_size = torch.distributed.get_world_size()
args = global_vars.get_args()
batch_size = args.global_batch_size
micro_batch_size = args.micro_batch_size
setup_microbatch_calculator(
args.rank,
args.rampup_batch_size,
args.global_batch_size,
args.micro_batch_size,
1, # args.data_parallel_size,
)
for forward_only in (True, False):
for name, forward_backward_func in fwd_bwd_functions.items():
n_tests += 1
# TODO (mkozuki): Test with data parallel size > 1.
pipeline_model_parallel_size = world_size
try:
forward_backward_func_template(
name,
forward_backward_func,
pipeline_model_parallel_size,
forward_only,
)
except Exception as e:
failures.append(
f"\t# {name} failed with pipeline size: {pipeline_model_parallel_size} "
f"and forward_only: {forward_only}\n"
f"pipeline rank: {parallel_state.get_pipeline_model_parallel_rank()}, "
f"virtual pipeline rank: {parallel_state.get_virtual_pipeline_model_parallel_rank()}\n"
f"{str(e)}"
)
finally:
parallel_state.destroy_model_parallel()
else:
print_separator(f"{name} works")
print_separator("TEST RESULT")
if failures:
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print("\n".join(failures))
msg = f"{len(failures)} / {n_tests} cases failed"
raise RuntimeError(msg)
else:
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print("### PASS!")
# coding=utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from apex.transformer import parallel_state
from apex.transformer import tensor_parallel
from apex.transformer.testing import global_vars
from apex.transformer.testing.commons import print_separator
from apex.transformer.testing.commons import initialize_distributed
from apex.transformer.testing.commons import TEST_SUCCESS_MESSAGE
global_vars.set_global_variables()
def test_set_cuda_rng_state(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing set_rng_state with size {} ...'.
format(tensor_model_parallel_size))
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
size = 123
seed = 1234
torch.cuda.manual_seed(seed)
tensor = torch.cuda.FloatTensor(size)
# Get the state
rng_state = torch.cuda.get_rng_state()
rng_state_copy = rng_state.clone()
# Advance the default CUDA RNG a few times.
for _ in range(5):
torch.randn(size, out=tensor)
result_1 = tensor.clone()
assert rng_state.sub(rng_state_copy).max() == 0
assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0
# State should be different.
new_rng_state = torch.cuda.get_rng_state()
max_diff = new_rng_state.sub(rng_state).max()
print(' max diff in rng state (should be non-zero) on global rank {}: {}'.
format(torch.distributed.get_rank(), max_diff))
assert max_diff > 0
# Reset the rng state and do the same stuff.
tensor_parallel.random._set_cuda_rng_state(rng_state)
for _ in range(5):
torch.randn(size, out=tensor)
tensor_parallel.random._set_cuda_rng_state(rng_state)
for _ in range(5):
torch.randn(size, out=tensor)
result_2 = tensor.clone()
# Results should be the same
error = result_2.sub(result_1).abs().max()
print(' max error in generated tensors (should be zero) on '
'global rank {}: {}'.format(torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Input state should have remained intact.
error = rng_state.sub(rng_state_copy).max()
print(' max error in rng state (should be zero) on global rank {}: {}'.
format(torch.distributed.get_rank(), error))
assert error == 0
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
def test_cuda_rng_tracker(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing cuda rng tracker with size {} ...'.
format(tensor_model_parallel_size))
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
seed_1 = 1234
seed_2 = 4321
size = [12, 21]
tensor = torch.cuda.FloatTensor(size)
# Set to seed_1 and generate two tensors.
torch.cuda.manual_seed(seed_1)
torch.randn(size, out=tensor)
target_11 = tensor.clone()
torch.randn(size, out=tensor)
target_12 = tensor.clone()
# Set to seed_2 and generate two tensors.
torch.cuda.manual_seed(seed_2)
torch.randn(size, out=tensor)
target_21 = tensor.clone()
torch.randn(size, out=tensor)
target_22 = tensor.clone()
# Now if we interleave seed_1 and seed_2,
# we should still get the same tensors
torch.cuda.manual_seed(seed_1)
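    # Register a separate RNG state named 'test' seeded with seed_2; fork('test') temporarily
    # swaps it in and restores the default state on exit.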
tensor_parallel.random.get_cuda_rng_tracker().add('test', seed_2)
torch.randn(size, out=tensor)
result_11 = tensor.clone()
with tensor_parallel.random.get_cuda_rng_tracker().fork('test'):
torch.randn(size, out=tensor)
result_21 = tensor.clone()
torch.randn(size, out=tensor)
result_12 = tensor.clone()
with tensor_parallel.random.get_cuda_rng_tracker().fork('test'):
torch.randn(size, out=tensor)
result_22 = tensor.clone()
diff = result_11.sub(result_21).abs().max()
diff = min(diff, result_12.sub(result_22).abs().max())
print(' max diff in generated tensors (should be non-zero) on '
'global rank {}: {}'.format(torch.distributed.get_rank(), diff))
assert diff > 1.0e-6
error = max(result_11.sub(target_11).abs().max(),
result_12.sub(target_12).abs().max())
error = max(error, result_21.sub(target_21).abs().max())
error = max(error, result_22.sub(target_22).abs().max())
print(' max error in generated tensors (should be zero) on '
'global rank {}: {}'.format(torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset the tracker
tensor_parallel.random.get_cuda_rng_tracker().reset()
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
def test_model_parallel_cuda_manual_seed(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print(
'> testing model parallel cuda manual seed with size {} ...'.format(
tensor_model_parallel_size))
parallel_state.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
tensor_parallel.random.model_parallel_cuda_manual_seed(12345)
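    # model_parallel_cuda_manual_seed seeds the default CUDA generator with the base seed and
    # registers a tensor-model-parallel RNG state offset by 2718 plus the tensor model parallel
    # rank; the asserts below check both states.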
assert torch.cuda.initial_seed() == 12345
with tensor_parallel.random.get_cuda_rng_tracker().fork():
assert (
torch.cuda.initial_seed() ==
12345 + 2718 + parallel_state.get_tensor_model_parallel_rank()
)
# Reset the tracker
tensor_parallel.random.get_cuda_rng_tracker().reset()
# Reset groups
parallel_state.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(TEST_SUCCESS_MESSAGE)
if __name__ == '__main__':
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
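    # Run each test for tensor model parallel sizes 1, 2, 4, ... up to the world size.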
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test set rng state')
test_set_cuda_rng_state(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test cuda rng tracker')
test_cuda_rng_tracker(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test model parallel cuda manual seed')
test_model_parallel_cuda_manual_seed(tensor_model_parallel_size)
tensor_model_parallel_size *= 2