Commit 6f7a8b39 authored by lcskrishna

Merge remote-tracking branch 'rocm_upstream/master' into ifu_07272020

parents 459de22d 9c80f6d3
@@ -9,6 +9,7 @@ parser = argparse.ArgumentParser()
 # FOR DISTRIBUTED: Parse for the local_rank argument, which will be supplied
 # automatically by torch.distributed.launch.
 parser.add_argument("--local_rank", default=0, type=int)
+parser.add_argument("--opt_level", default="O2", type=str)
 args = parser.parse_args()
 
 # FOR DISTRIBUTED: If we are running under torch.distributed.launch,
@@ -42,7 +43,7 @@ y = torch.randn(N, D_out, device='cuda')
 
 model = torch.nn.Linear(D_in, D_out).cuda()
 optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
-model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
+model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)
 
 if args.distributed:
     # FOR DISTRIBUTED: After amp.initialize, wrap the model with
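The two hunks above make the AMP opt level configurable from the command line instead of hard-coding "O2". The rest of the example script is not shown in this diff, so the following is only a sketch of how the flag would flow through it, assembled from the visible context lines plus the usual Apex distributed-example pattern; the training loop and the rank{N}model.pth / rank{N}master.pth save step are assumptions inferred from compare.py and the shell script below, not the actual file contents.

# Sketch only: how --opt_level reaches amp.initialize in a script launched with
# `python -m torch.distributed.launch --nproc_per_node=2 ...`.
import argparse
import os

import torch
from apex import amp
from apex.parallel import DistributedDataParallel

parser = argparse.ArgumentParser()
# FOR DISTRIBUTED: local_rank is supplied automatically by torch.distributed.launch.
parser.add_argument("--local_rank", default=0, type=int)
parser.add_argument("--opt_level", default="O2", type=str)  # the new flag added above
args = parser.parse_args()

# Treat the run as distributed when WORLD_SIZE is set (the test script exports it).
args.distributed = int(os.environ.get("WORLD_SIZE", "1")) > 1
if args.distributed:
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend="nccl", init_method="env://")

N, D_in, D_out = 64, 1024, 16
x = torch.randn(N, D_in, device="cuda")
y = torch.randn(N, D_out, device="cuda")

model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# The opt level now comes from the command line instead of being fixed to "O2".
model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)

if args.distributed:
    # FOR DISTRIBUTED: after amp.initialize, wrap the model with apex's DDP.
    model = DistributedDataParallel(model)

loss_fn = torch.nn.MSELoss()
for _ in range(10):
    optimizer.zero_grad()
    loss = loss_fn(model(x), y)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

# Assumed save step: each rank dumps its model and master params for compare.py.
torch.save(list(model.parameters()), "rank{}model.pth".format(args.local_rank))
torch.save(list(amp.master_params(optimizer)), "rank{}master.pth".format(args.local_rank))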
@@ -14,6 +14,9 @@ for model_rank0, model_rank1, master_rank0, master_rank1 in zip(
         model_params_rank1,
         master_params_rank0,
         master_params_rank1):
+    # converting model params to float is a hack since allclose doesn't support bfloat16 yet.
+    model_rank0 = model_rank0.float()
+    model_rank1 = model_rank1.float()
     assert torch.allclose(model_rank0, model_rank1), "Model param mismatch"
     assert torch.allclose(master_rank0, master_rank1), "Master param mismatch"
     # Some debugging/investigation assistance code:
@@ -23,6 +26,6 @@ for model_rank0, model_rank1, master_rank0, master_rank1 in zip(
     # print(maxval.item(), maxind.item(), offending_val_half.item(), offending_val_float.item(),
     #       offending_val_float.half().item())
     # rtol needs to be > 2^-11 because of denormals...
-    assert torch.allclose(model_rank0, master_rank0.half(), rtol=.005), "Model-master mismatch"
+    assert torch.allclose(model_rank0, master_rank0, rtol=.005), "Model-master mismatch"
 
 print("OK: Model and master params match across ranks.")
#!/bin/bash
set -e
# Run the tests on 2 GPUs
export WORLD_SIZE=2
# Test with opt_level="O2"
echo "running opt_level O2"
python3.6 -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O2"
python3.6 amp_master_params/compare.py
# delete the model files
echo -e "O2 test completed. Deleting model files\n"
rm rank0model.pth
rm rank1model.pth
rm rank0master.pth
rm rank1master.pth
# Test with opt_level="O5"
echo "running opt_level O5"
python3.6 -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O5"
python3.6 amp_master_params/compare.py
# delete the model files
echo "O5 test completed. Deleting model files"
rm rank0model.pth
rm rank1model.pth
rm rank0master.pth
rm rank1master.pth
## Run the Sync BN Tests.
echo "Running syncbn tests"
python3.6 -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_test_different_batch_size.py --apex
echo "Running syncbn python only tests"
python3.6 synced_batchnorm/python_single_gpu_unit_test.py