Commit 6f7a8b39, authored Jul 27, 2020 by lcskrishna

Merge remote-tracking branch 'rocm_upstream/master' into ifu_07272020

Parents: 459de22d, 9c80f6d3
Showing 3 changed files with 43 additions and 2 deletions (+43, -2):

  tests/distributed/amp_master_params/amp_master_params.py   (+2, -1)
  tests/distributed/amp_master_params/compare.py              (+4, -1)
  tests/distributed/run_rocm_distributed.sh                   (+37, -0)
tests/distributed/amp_master_params/amp_master_params.py
@@ -9,6 +9,7 @@ parser = argparse.ArgumentParser()
 # FOR DISTRIBUTED: Parse for the local_rank argument, which will be supplied
 # automatically by torch.distributed.launch.
 parser.add_argument("--local_rank", default=0, type=int)
+parser.add_argument("--opt_level", default="O2", type=str)
 args = parser.parse_args()

 # FOR DISTRIBUTED: If we are running under torch.distributed.launch,
...
@@ -42,7 +43,7 @@ y = torch.randn(N, D_out, device='cuda')
 model = torch.nn.Linear(D_in, D_out).cuda()
 optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

-model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
+model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)

 if args.distributed:
     # FOR DISTRIBUTED: After amp.initialize, wrap the model with
...
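For context, here is a minimal standalone sketch (not part of the commit) of the pattern this change enables: the opt_level given on the command line is forwarded to amp.initialize instead of being hard-coded to "O2". It assumes apex is installed and a CUDA/ROCm device is available; the layer sizes and learning rate are illustrative only.

# Minimal sketch of the new flow: --opt_level comes from the command line and
# is forwarded to amp.initialize. Sizes and lr below are illustrative only.
import argparse

import torch
from apex import amp

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", default=0, type=int)   # supplied by torch.distributed.launch
parser.add_argument("--opt_level", default="O2", type=str) # the flag added in this commit
args = parser.parse_args()

model = torch.nn.Linear(64, 32).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# opt_level is no longer hard-coded to "O2"; it follows the CLI argument.
model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)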
tests/distributed/amp_master_params/compare.py
@@ -14,6 +14,9 @@ for model_rank0, model_rank1, master_rank0, master_rank1 in zip(
     model_params_rank1,
     master_params_rank0,
     master_params_rank1):
+    # converting model params to float is a hack since allclose doesn't support bfloat16 yet.
+    model_rank0 = model_rank0.float()
+    model_rank1 = model_rank1.float()
     assert torch.allclose(model_rank0, model_rank1), "Model param mismatch"
     assert torch.allclose(master_rank0, master_rank1), "Master param mismatch"
     # Some debugging/investigation assistance code:
...
@@ -23,6 +26,6 @@ for model_rank0, model_rank1, master_rank0, master_rank1 in zip(
     #     print(maxval.item(), maxind.item(), offending_val_half.item(), offending_val_float.item(),
     #           offending_val_float.half().item())
     # rtol needs to be > 2^-11 because of denormals...
-    assert torch.allclose(model_rank0, master_rank0.half(), rtol=.005), "Model-master mismatch"
+    assert torch.allclose(model_rank0, master_rank0, rtol=.005), "Model-master mismatch"

 print("OK: Model and master params match across ranks.")
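The float() casts added above work around the fact that, at the time, torch.allclose did not accept bfloat16 inputs. A small self-contained sketch of that comparison pattern follows; the tensors are illustrative, not the test's saved checkpoints.

# Sketch of the compare.py workaround: cast low-precision tensors to float32
# before torch.allclose. Tensors here are illustrative, not the saved params.
import torch

model_rank0 = torch.randn(16).bfloat16()
model_rank1 = model_rank0.clone()

# Comparing in float32 sidesteps bfloat16 gaps in older allclose builds;
# rtol is loosened (as in the test) to tolerate low-precision rounding.
assert torch.allclose(model_rank0.float(), model_rank1.float(), rtol=0.005)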
tests/distributed/run_rocm_distributed.sh (new file, 0 → 100644)
#!/bin/bash
set -e

# To run the test on 2 gpus
export WORLD_SIZE=2

# Test with opt_level="O2"
echo "running opt_level O2"
python3.6 -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O2"
python3.6 amp_master_params/compare.py

# delete the model files
echo -e "O2 test completed. Deleting model files\n"
rm rank0model.pth
rm rank1model.pth
rm rank0master.pth
rm rank1master.pth

# Test with opt_level="O5"
echo "running opt_level O5"
python3.6 -m torch.distributed.launch --nproc_per_node=2 amp_master_params/amp_master_params.py --opt_level "O5"
python3.6 amp_master_params/compare.py

# delete the model files
echo "O5 test completed. Deleting model files"
rm rank0model.pth
rm rank1model.pth
rm rank0master.pth
rm rank1master.pth

## Run the Sync BN Tests.
echo "Running syncbn tests"
python3.6 -m torch.distributed.launch --nproc_per_node=2 synced_batchnorm/two_gpu_test_different_batch_size.py --apex
echo "Running syncbn python only tests"
python3.6 synced_batchnorm/python_single_gpu_unit_test.py
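Each process launched by these torch.distributed.launch commands receives --local_rank automatically, as the comments in amp_master_params.py note. Below is a rough sketch of the per-rank setup such a launched script typically performs; the NCCL backend and env:// rendezvous are assumptions here, not taken from this diff.

# Rough sketch (assumptions: NCCL backend, env:// rendezvous) of the per-rank
# setup behind a script launched with torch.distributed.launch --nproc_per_node=2.
import argparse

import torch
import torch.distributed as dist

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", default=0, type=int)  # filled in by the launcher
args = parser.parse_args()

torch.cuda.set_device(args.local_rank)  # bind this process to one GPU
dist.init_process_group(backend="nccl", init_method="env://")  # WORLD_SIZE/RANK come from env
print("rank %d of %d ready" % (dist.get_rank(), dist.get_world_size()))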