OpenDAS / apex / Commits / 541da7a0

Commit 541da7a0 (unverified), authored Dec 02, 2021 by Hubert Lu, committed by GitHub on Dec 02, 2021

Merge pull request #58 from ROCmSoftwarePlatform/dev/hubertlu/unit_tests

Add more unit tests for both distributed and extensions

Parents: 08e88b1b 2228f1bf
Changes: 4 changed files with 45 additions and 9 deletions (+45 -9)
apex/contrib/multihead_attn/self_multihead_attn.py (+1 -1)
apex/contrib/test/run_rocm_extensions.py (+27 -0)
tests/L0/run_rocm.sh (+1 -1)
tests/distributed/run_rocm_distributed.sh (+16 -7)
apex/contrib/multihead_attn/self_multihead_attn.py

@@ -160,7 +160,7 @@ class SelfMultiheadAttn(nn.Module):
             outputs = self.attn_func(attn_mask is not None, is_training, self.num_heads, self.scaling, lyr_nrm_results,
                                      input_weights, self.out_proj_weight,
                                      input_bias, self.out_proj_bias,
-                                     mask, self.dropout)
+                                     mask, self.mask_additive, self.dropout)
             if is_training:
                 outputs = jit_dropout_add(outputs, query, self.dropout, is_training)
             else:
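Note on the change above: SelfMultiheadAttn carries a mask_additive flag, and this commit forwards it to the fused attention function so the kernel knows which masking convention the caller uses. The snippet below is an illustrative plain-PyTorch sketch of the two conventions (assumed semantics, not the fused kernel itself):

# Illustrative sketch only: a boolean mask removes key positions via
# masked_fill, while an additive mask is summed into the attention
# scores before softmax. Both yield the same probabilities here.
import torch
import torch.nn.functional as F

scores = torch.randn(2, 4, 4)               # (batch, query, key) attention logits
bool_mask = torch.zeros(2, 4, 4, dtype=torch.bool)
bool_mask[:, :, -1] = True                  # mask out the last key position

# Boolean convention: fill masked positions with -inf before softmax.
probs_bool = F.softmax(scores.masked_fill(bool_mask, float("-inf")), dim=-1)

# Additive convention (mask_additive-style): the mask already holds
# 0 for kept positions and a large negative value for masked ones.
add_mask = torch.zeros(2, 4, 4)
add_mask[:, :, -1] = float("-inf")
probs_add = F.softmax(scores + add_mask, dim=-1)

assert torch.allclose(probs_bool, probs_add)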
apex/contrib/test/run_rocm_extensions.py (new file, 0 → 100644)

import unittest
import sys

test_dirs = ["groupbn", "layer_norm", "multihead_attn", "."]  # "." for test_label_smoothing.py
ROCM_BLACKLIST = ["groupbn", "layer_norm"]

runner = unittest.TextTestRunner(verbosity=2)
errcode = 0

for test_dir in test_dirs:
    if test_dir in ROCM_BLACKLIST:
        continue
    suite = unittest.TestLoader().discover(test_dir)
    print("\nExecuting tests from " + test_dir)
    result = runner.run(suite)
    if not result.wasSuccessful():
        errcode = 1

sys.exit(errcode)
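Because unittest discovery resolves the entries of test_dirs relative to the working directory, the runner is presumably invoked from apex/contrib/test/. TestLoader().discover(test_dir) collects files matching test*.py; a minimal, hypothetical module it would pick up (not part of this commit) looks like this:

# test_example.py -- hypothetical illustration. unittest discovery
# matches files named test*.py and runs every unittest.TestCase
# subclass they define.
import unittest

class ExampleTest(unittest.TestCase):
    def test_smoke(self):
        self.assertEqual(1 + 1, 2)

if __name__ == "__main__":
    unittest.main()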
tests/L0/run_rocm.sh

 #!/bin/bash
-APEX_TEST_WITH_ROCM=1 python3.6 run_test.py
+APEX_TEST_WITH_ROCM=1 python run_test.py
tests/distributed/run_rocm_distributed.sh

@@ -6,8 +6,8 @@ export WORLD_SIZE=2
 # Test with opt_level="O2"
 echo "running opt_level O2"
-python3.6 -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O2"
+python -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O2"
-python3.6 amp_master_params/compare.py
+python amp_master_params/compare.py

 # delete the model files
 echo -e "O2 test completed. Deleting model files\n"
@@ -19,9 +19,9 @@ rm rank1master.pth
 # Test with opt_level="O5"
 #echo "running opt_level O5"
-#python3.6 -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O5"
+#python -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O5"
-#python3.6 amp_master_params/compare.py
+#python amp_master_params/compare.py
 #
 ## delete the model files
 #echo "O5 test completed. Deleting model files"
 #rm rank0model.pth
@@ -31,7 +31,16 @@ rm rank1master.pth
 ## Run the Sync BN Tests.
 echo "Running syncbn tests"
-python3.6 -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_test_different_batch_size.py --apex
+python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_unit_test.py
+python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_unit_test.py --fp16
+python -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_test_different_batch_size.py --apex
 echo "Running syncbn python only tests"
-python3.6 synced_batchnorm/python_single_gpu_unit_test.py
+python synced_batchnorm/python_single_gpu_unit_test.py
+echo "Running syncbn batchnorm1d tests"
+python synced_batchnorm/test_batchnorm1d.py
+#beware, you need a system with at least 4 gpus to test group_size<world_size (currently fail both on upstream and rocm fork)
+#python -m torch.distributed.launch --nproc_per_node=4 test_groups.py --group_size=2
+## Run the DDP Tests
+echo "running DDP tests"
+HIP_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 DDP/ddp_race_condition_test.py
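For reference, every distributed test above is started through torch.distributed.launch with two processes per node. In the PyTorch releases current at the time of this commit, that launcher passes a --local_rank argument to each worker and exports the rendezvous variables (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE). A minimal sketch of a worker script compatible with that launcher, under those assumptions:

# Minimal sketch of a torch.distributed.launch-compatible worker
# (assumes the pre-2.0 launcher that passes --local_rank).
import argparse
import torch
import torch.distributed as dist

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=0)
args = parser.parse_args()

# On ROCm builds of PyTorch, torch.cuda addresses HIP devices, so the
# same code runs on AMD GPUs (HIP_VISIBLE_DEVICES filters which ones).
torch.cuda.set_device(args.local_rank)

# The "nccl" backend is backed by RCCL on ROCm; init_method defaults
# to env://, reading the variables exported by the launcher.
dist.init_process_group(backend="nccl")

print(f"rank {dist.get_rank()} of {dist.get_world_size()} ready")
dist.destroy_process_group()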