OpenDAS / deepspeed · Commits
".github/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "a59f9990fc1251ada20e86dfa615b04360bd013f"
Commit e59ba12d (Unverified)
make test_pipe more stable (#683)
Authored Jan 20, 2021 by Shaden Smith; committed by GitHub, Jan 20, 2021.
Parent: 7b0bee0b
Showing 1 changed file with 40 additions and 22 deletions.

tests/unit/test_pipe.py (+40, -22)
 import os
+import copy
 import torch
 import torch.nn as nn
...
@@ -13,8 +14,7 @@ import deepspeed.runtime.utils as ds_utils
 from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology
 PipeTopo = PipeDataParallelTopology
-import deepspeed.runtime.pipe.module as PipelineModule
-from deepspeed.runtime.pipe.module import LayerSpec
+from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec
 from common import distributed_test
...
@@ -74,7 +74,13 @@ class AlexNet(nn.Module):
         return self.loss_fn(x, y)

-class AlexNetPipe(PipelineModule.PipelineModule):
+class AlexNetPipe(AlexNet):
+    def to_layers(self):
+        layers = [
+            *self.features,
+            lambda x: x.view(x.size(0), -1),
+            self.classifier
+        ]
+        return layers

 class AlexNetPipeSpec(PipelineModule):
     def __init__(self, num_classes=10, **kwargs):
         self.num_classes = num_classes
         specs = [
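The to_layers() pattern added above is the heart of the refactor: rather than maintaining a bespoke pipeline subclass, the test keeps one plain AlexNet and flattens it into a list of callables that PipelineModule can partition into stages. Below is a minimal sketch of the same idea with a hypothetical TinyNet stand-in (layer sizes are illustrative, not the test's actual AlexNet):

    import torch
    import torch.nn as nn

    class TinyNet(nn.Module):
        # Stand-in for AlexNet: a conv feature extractor plus a linear classifier.
        def __init__(self, num_classes=10):
            super().__init__()
            self.features = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU())
            self.classifier = nn.Linear(8 * 32 * 32, num_classes)

        def to_layers(self):
            # Flatten into a list of callables; the lambda performs the reshape
            # that a monolithic forward() would do between features and classifier.
            return [*self.features, lambda x: x.view(x.size(0), -1), self.classifier]

    # Under torch.distributed initialization, the list can then be partitioned:
    #   from deepspeed.runtime.pipe.module import PipelineModule
    #   model = PipelineModule(layers=TinyNet().to_layers(), num_stages=1,
    #                          loss_fn=nn.CrossEntropyLoss())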
...
@@ -135,6 +141,9 @@ def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, s
     with torch.random.fork_rng(devices=[torch.cuda.current_device()]):
         ds_utils.set_random_seed(seed)
+
+        # disable dropout
+        model.eval()

         trainset = cifar_trainset(fp16=fp16)
         args.local_rank = dist.get_rank()
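Switching the model to eval() before training is the main source of the added stability: dropout draws fresh randomness every step, so models partitioned across different numbers of ranks would consume different random streams and their losses would drift apart even when starting from identical weights. In eval mode dropout is the identity (gradients still flow, so engine.train_batch() keeps optimizing). A standalone sketch of the behavior being disabled:

    import torch
    import torch.nn as nn

    torch.manual_seed(0)
    drop = nn.Dropout(p=0.5)
    x = torch.ones(4)

    drop.train()
    print(drop(x))  # elements randomly zeroed, survivors scaled by 2; varies per call
    drop.eval()
    print(drop(x))  # identity: tensor([1., 1., 1., 1.])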
...
@@ -148,7 +157,7 @@ def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, s
         for step in range(num_steps):
             loss = engine.train_batch()
             losses.append(loss.item())
-            if step % 50 == 0:
+            if step % 50 == 0 and dist.get_rank() == 0:
                 print(f'STEP={step} LOSS={loss.item()}')

     if average_dp_losses:
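Gating the progress print on the global rank is the standard way to keep a multi-process test from emitting one copy of every log line per rank; only rank 0 reports. In isolation, the guard looks like this (a sketch, assuming the process group is already initialized, as it is inside the test):

    import torch.distributed as dist

    # Only one of the N training processes prints the periodic loss line.
    if dist.is_initialized() and dist.get_rank() == 0:
        print('STEP=0 LOSS=2.302')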
...
@@ -160,18 +169,16 @@ def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, s
     return losses

-@pytest.mark.parametrize('base_topo,test_topo',
-                         [(PipeTopo(num_pp=1,
-                                    num_dp=4),
-                           PipeTopo(num_pp=2,
-                                    num_dp=2)),
-                          (PipeTopo(num_pp=1,
-                                    num_dp=4),
-                           PipeTopo(num_pp=4,
-                                    num_dp=1))])
-def test_pipe_cifar10_seedlayers(base_topo, test_topo, tmpdir):
+@pytest.mark.parametrize('topo',
+                         [PipeTopo(num_pp=1,
+                                   num_dp=4),
+                          PipeTopo(num_pp=2,
+                                   num_dp=2),
+                          PipeTopo(num_pp=4,
+                                   num_dp=1)])
+def test_pipe_cifar10(topo, tmpdir):
     config_dict = {
         "train_batch_size": 16,
         "train_micro_batch_size_per_gpu": 4,
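The parametrization change is the structural half of the fix: instead of training two topologies per case and comparing them against each other, each topology is now compared against a single data-parallel baseline constructed inside the test. Each PipeTopo places the four ranks supplied by @distributed_test(world_size=4) on a pipeline-by-data grid, so num_pp * num_dp must equal 4: (1, 4) is pure data parallelism, (4, 1) pure pipeline parallelism, and (2, 2) a hybrid. A small sketch of that constraint:

    from deepspeed.runtime.pipe.topology import PipeDataParallelTopology

    WORLD_SIZE = 4  # matches @distributed_test(world_size=4)
    for num_pp, num_dp in [(1, 4), (2, 2), (4, 1)]:
        assert num_pp * num_dp == WORLD_SIZE
        topo = PipeDataParallelTopology(num_pp=num_pp, num_dp=num_dp)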
...
@@ -199,21 +206,32 @@ def test_pipe_cifar10_seedlayers(base_topo, test_topo, tmpdir):
     }
     args = args_from_dict(tmpdir, config_dict)

+    # Allocate model for consistent initial weights.
+    init_net = AlexNetPipe()
+
     @distributed_test(world_size=4)
-    def _helper(base_topo, test_topo, tmpdir, steps=500):
+    def _helper(topo, tmpdir, steps=500):
         assert steps >= 100

-        base_model = AlexNetPipe(num_classes=10,
-                                 topology=base_topo,
-                                 seed_layers=config_dict['pipeline']['seed_layers'])
+        base_net = copy.deepcopy(init_net)
+        base_model = PipelineModule(layers=base_net.to_layers(),
+                                    num_stages=1,
+                                    loss_fn=nn.CrossEntropyLoss())
+
+        # Train with just data parallelism
         base_losses = train_cifar(base_model,
                                   args,
                                   num_steps=steps,
                                   fp16=config_dict['fp16']['enabled'])

-        test_model = AlexNetPipe(num_classes=10,
-                                 topology=test_topo,
-                                 seed_layers=config_dict['pipeline']['seed_layers'])
+        test_net = copy.deepcopy(init_net)
+        test_model = PipelineModule(layers=test_net.to_layers(),
+                                    topology=topo,
+                                    loss_fn=nn.CrossEntropyLoss())
+        #test_model = AlexNetPipe(num_classes=10,
+        #                         topology=test_topo,
+        #                         seed_layers=config_dict['pipeline']['seed_layers'])
         test_losses = train_cifar(test_model,
                                   args,
                                   num_steps=steps,
...
@@ -246,4 +264,4 @@ def test_pipe_cifar10_seedlayers(base_topo, test_topo, tmpdir):
         test_avg = sum(test) / len(test)
         assert rel_diff(base_avg, test_avg) < 0.03

-    _helper(base_topo, test_topo, tmpdir)
+    _helper(topo, tmpdir)
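The closing assertion bounds the relative gap between the two runs' average losses at 3%. rel_diff is defined earlier in test_pipe.py and does not appear in this diff; a plausible formulation (an assumption, not the file's verbatim code) and the tolerance it implies:

    # Hypothetical definition; the real one lives earlier in test_pipe.py.
    def rel_diff(a, b):
        return abs(a - b) / abs(a)

    # A 3% tolerance: an average test loss of 2.05 against a baseline of 2.00
    # passes (rel_diff = 0.025), while 2.10 would fail (rel_diff = 0.05).
    assert rel_diff(2.00, 2.05) < 0.03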