Unverified Commit 541da7a0 authored by Hubert Lu's avatar Hubert Lu Committed by GitHub
Browse files

Merge pull request #58 from ROCmSoftwarePlatform/dev/hubertlu/unit_tests

Add more unit tests for both distributed and extensions
parents 08e88b1b 2228f1bf
...@@ -160,7 +160,7 @@ class SelfMultiheadAttn(nn.Module): ...@@ -160,7 +160,7 @@ class SelfMultiheadAttn(nn.Module):
outputs = self.attn_func(attn_mask is not None, is_training, self.num_heads, self.scaling, lyr_nrm_results, outputs = self.attn_func(attn_mask is not None, is_training, self.num_heads, self.scaling, lyr_nrm_results,
input_weights, self.out_proj_weight, input_weights, self.out_proj_weight,
input_bias, self.out_proj_bias, input_bias, self.out_proj_bias,
mask, self.dropout) mask, self.mask_additive, self.dropout)
if is_training: if is_training:
outputs = jit_dropout_add(outputs, query, self.dropout, is_training) outputs = jit_dropout_add(outputs, query, self.dropout, is_training)
else: else:
......
import unittest
import sys
# Directories searched for unit tests, in execution order.
# "." is listed so that test_label_smoothing.py is picked up.
test_dirs = ["groupbn", "layer_norm", "multihead_attn", "."]

# Directories from test_dirs that this runner skips entirely.
ROCM_BLACKLIST = ["groupbn", "layer_norm"]

runner = unittest.TextTestRunner(verbosity=2)

# Process exit status: stays 0 unless at least one executed suite fails.
errcode = 0

for test_dir in test_dirs:
    if test_dir not in ROCM_BLACKLIST:
        suite = unittest.TestLoader().discover(test_dir)
        print(f"\nExecuting tests from {test_dir}")
        result = runner.run(suite)
        if not result.wasSuccessful():
            # Remember the failure but keep running the remaining directories.
            errcode = 1

sys.exit(errcode)
#!/bin/bash #!/bin/bash
APEX_TEST_WITH_ROCM=1 python3.6 run_test.py APEX_TEST_WITH_ROCM=1 python run_test.py
...@@ -6,8 +6,8 @@ export WORLD_SIZE=2 ...@@ -6,8 +6,8 @@ export WORLD_SIZE=2
# Test with opt_level="O2" # Test with opt_level="O2"
echo "running opt_level O2" echo "running opt_level O2"
python3.6 -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O2" python -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O2"
python3.6 amp_master_params/compare.py python amp_master_params/compare.py
# delete the model files # delete the model files
echo -e "O2 test completed. Deleting model files\n" echo -e "O2 test completed. Deleting model files\n"
...@@ -19,9 +19,9 @@ rm rank1master.pth ...@@ -19,9 +19,9 @@ rm rank1master.pth
# Test with opt_level="O5" # Test with opt_level="O5"
#echo "running opt_level O5" #echo "running opt_level O5"
#python3.6 -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O5" #python -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O5"
#python3.6 amp_master_params/compare.py #python amp_master_params/compare.py
#
## delete the model files ## delete the model files
#echo "O5 test completed. Deleting model files" #echo "O5 test completed. Deleting model files"
#rm rank0model.pth #rm rank0model.pth
...@@ -31,7 +31,16 @@ rm rank1master.pth ...@@ -31,7 +31,16 @@ rm rank1master.pth
## Run the Sync BN Tests. ## Run the Sync BN Tests.
echo "Running syncbn tests" echo "Running syncbn tests"
python3.6 -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_test_different_batch_size.py --apex python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_unit_test.py
python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_unit_test.py --fp16
python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_test_different_batch_size.py --apex
echo "Running syncbn python only tests" echo "Running syncbn python only tests"
python3.6 synced_batchnorm/python_single_gpu_unit_test.py python synced_batchnorm/python_single_gpu_unit_test.py
echo "Running syncbn batchnorm1d tests"
python synced_batchnorm/test_batchnorm1d.py
# Beware: you need a system with at least 4 GPUs to test group_size<world_size (currently fails on both upstream and the ROCm fork).
#python -m torch.distributed.launch --nproc_per_node=4 test_groups.py --group_size=2
## Run the DDP Tests
echo "running DDP tests"
HIP_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 DDP/ddp_race_condition_test.py
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment