OpenDAS / fairscale · Commits

Commit 10062e58 (unverified)
Authored Oct 16, 2020 by Benjamin Lefaudeux, committed by GitHub on Oct 16, 2020

[feat][minor] OSS: benchmark - adding a cpu option (#144)

* adding a cpu option
* adjust the reference loss
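As a quick usage sketch (the --cpu, --gloo and --world_size flags are visible in this diff; --epochs, --batch_size and --data_size are inferred from the args.* attributes the script reads and are an assumption here), a CPU-only run would look roughly like:

    python benchmarks/oss.py --cpu --world_size 2 --epochs 2 --batch_size 16 --data_size 128

With --cpu set, the backend resolves to gloo and the CUDA-only profiling and memory-counter calls are skipped, as shown in the diff below.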
parent 61234360
Changes: 2 changed files, with 28 additions and 68 deletions (+28 / -68)

  .circleci/config.yml   +1  -1
  benchmarks/oss.py      +27 -67
.circleci/config.yml — view file @ 10062e58

@@ -108,7 +108,7 @@ run_oss_benchmark: &run_oss_benchmark
   - run:
       name: Run OSS Benchmark
       command: |
-        python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 13.7 --reference_memory 4390 --reference_loss 0.152
+        python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 13.7 --reference_memory 4390 --reference_loss 0.302
 
 run_oss_gloo: &run_oss_gloo
   - run:
benchmarks/oss.py — view file @ 10062e58

@@ -30,15 +30,15 @@ def dist_init(rank, world_size, backend):
     dist.init_process_group(backend=backend, init_method="tcp://localhost:29501", rank=rank, world_size=world_size)
 
 
-def get_problem(rank, data_size, batch_size):
+def get_problem(rank, data_size, batch_size, device):
     # Standard RN101
-    model = resnet101(pretrained=False, progress=True).to(rank)
+    model = resnet101(pretrained=False, progress=True).to(device)
 
     # Data setup, dummy data
     def collate(inputs: List[Any]):
         return {
-            "inputs": torch.stack([i[0] for i in inputs]).to(torch.device(rank)),
-            "label": torch.stack([i[1] for i in inputs]).to(torch.device(rank)),
+            "inputs": torch.stack([i[0] for i in inputs]).to(device),
+            "label": torch.stack([i[1] for i in inputs]).to(device),
         }
 
     dataloader = DataLoader(

@@ -59,20 +59,13 @@ class OptimType(str, Enum):
 
 def train(
     rank: int,
-    world_size: int,
-    num_epochs: int = 10,
-    batch_size: int = 32,
-    data_size: int = 200,
+    args: argparse.Namespace,
     backend: str = "gloo",
     optim_type: OptimType = OptimType.vanilla,
-    profile: bool = False,
     check_regression: bool = True,
-    reference_speed: float = -1.0,
-    reference_memory: float = -1.0,
-    reference_loss: float = -1.0,
 ):
     # DDP
-    dist_init(rank=rank, world_size=world_size, backend=backend)
+    dist_init(rank=rank, world_size=args.world_size, backend=backend)
 
     # Setup
     torch.cuda.set_device(rank)

@@ -84,7 +77,8 @@ def train(
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
 
-    model, dataloader, loss_fn = get_problem(rank, data_size, batch_size)
+    device = torch.device("cpu") if args.cpu else torch.device(rank)
+    model, dataloader, loss_fn = get_problem(rank, args.data_size, args.batch_size, device)
 
     # Shard the optimizer
     optimizer: Optional[torch.optim.Optimizer] = None

@@ -94,7 +88,7 @@ def train(
             module=model,
             optimizer=OPTIM,
             optimizer_params={"lr": 1e-4, "momentum": 0.9},
-            world_size=world_size,
+            world_size=args.world_size,
             broadcast_buffers=True,
         )
         ddp.train()

@@ -109,18 +103,19 @@ def train(
         )
 
     # Reset the memory use counter
-    torch.cuda.reset_peak_memory_stats(rank)
+    if not args.cpu:
+        torch.cuda.reset_peak_memory_stats(rank)
+        torch.cuda.synchronize(rank)
 
     # Dummy training loop
-    torch.cuda.synchronize(rank)
     training_start = time.monotonic()
     model.train()
 
     measurements = []
     final_loss: Optional[float] = -1.0
-    need_profiling = profile
+    need_profiling = args.profile
 
-    for epoch in range(num_epochs):
+    for epoch in range(args.epochs):
         epoch_start = time.monotonic()
 
         for batch in dataloader:

@@ -129,7 +124,6 @@ def train(
             model.zero_grad()
             outputs = model(batch["inputs"])
             loss = loss_fn(outputs, batch["label"])
-            loss /= world_size
             loss.backward()
 
             if optim_type == OptimType.oss_sdp:

@@ -137,7 +131,7 @@ def train(
                 return loss
 
-            if need_profiling:
+            if need_profiling and not args.cpu:
                 print("Profiling the run")
                 with profiler.profile(use_cuda=True) as prof:  # type: ignore
                     with profiler.record_function("batch"):

@@ -163,13 +157,14 @@ def train(
                 _ = optimizer.state_dict()
                 print("... State dict collected")
 
-        measurements.append(data_size / (epoch_end - epoch_start))
+        measurements.append(args.data_size / (epoch_end - epoch_start))
         if dist.get_rank() == 0:
             print(f"Epoch {epoch} - processed {measurements[-1]:.2f} img per sec. Loss {final_loss:.3f}")
 
-    torch.cuda.synchronize(rank)
+    if not args.cpu:
+        torch.cuda.synchronize(rank)
     training_stop = time.monotonic()
-    img_per_sec = data_size / (training_stop - training_start) * num_epochs
+    img_per_sec = args.data_size / (training_stop - training_start) * args.epochs
     max_memory = torch.cuda.max_memory_allocated(rank) / 2 ** 20
 
     print(f"[{dist.get_rank()}] : Training done. {img_per_sec:.2f} img per sec overall")

@@ -182,9 +177,9 @@ def train(
     print(f"[{dist.get_rank()}] : Mean speed: {mean:.2f} +/- {std:.2f}")
 
     if check_regression and dist.get_rank() == 0:
-        assert (mean + 3.0 * std) > reference_speed, "Speed regression detected"
-        assert max_memory < 1.05 * reference_memory, "Memory use regression detected"
-        assert abs(cast(float, final_loss) - reference_loss) < 1e-3, "Loss regression detected"
+        assert (mean + 3.0 * std) > args.reference_speed, "Speed regression detected"
+        assert max_memory < 1.05 * args.reference_memory, "Memory use regression detected"
+        assert abs(cast(float, final_loss) - args.reference_loss) < 1e-3, "Loss regression detected"
 
         print("[Regression Test] VALID")

@@ -208,26 +203,18 @@ if __name__ == "__main__":
     )
     parser.add_argument("--gloo", action="store_true", default=False)
    parser.add_argument("--profile", action="store_true", default=False)
+    parser.add_argument("--cpu", action="store_true", default=False)
 
     args = parser.parse_args()
     print(f"Benchmark arguments: {args}")
 
-    backend = "nccl" if not args.gloo or not torch.cuda.is_available() else "gloo"
+    backend = "nccl" if (not args.gloo or not torch.cuda.is_available()) and not args.cpu else "gloo"
 
     if args.optim_type == OptimType.vanilla or args.optim_type == OptimType.everyone:
         print("\nBenchmark vanilla optimizer")
         mp.spawn(
             train,
-            args=(
-                args.world_size,
-                args.epochs,
-                args.batch_size,
-                args.data_size,
-                backend,
-                OptimType.vanilla,
-                args.profile,
-                False,  # no regression check
-            ),
+            args=(args, backend, OptimType.vanilla, False,),  # no regression check
             nprocs=args.world_size,
             join=True,
         )

@@ -235,41 +222,14 @@ if __name__ == "__main__":
     if args.optim_type == OptimType.oss or args.optim_type == OptimType.everyone:
         print("\nBenchmark OSS with DDP")
         mp.spawn(
-            train,
-            args=(
-                args.world_size,
-                args.epochs,
-                args.batch_size,
-                args.data_size,
-                backend,
-                OptimType.oss,
-                args.profile,
-                args.check_regression,
-                args.reference_speed,
-                args.reference_memory,
-                args.reference_loss,
-            ),
-            nprocs=args.world_size,
-            join=True,
+            train, args=(args, backend, OptimType.oss, args.check_regression), nprocs=args.world_size, join=True,
         )
 
     if args.optim_type == OptimType.oss_sdp or args.optim_type == OptimType.everyone:
         print("\nBenchmark OSS with SDP")
         mp.spawn(
             train,
-            args=(
-                args.world_size,
-                args.epochs,
-                args.batch_size,
-                args.data_size,
-                backend,
-                OptimType.oss_sdp,
-                args.profile,
-                False,  # FIXME: @lefaudeux - SDP should give the same results
-                -1,  # Not checking SDP for speed regression for now, still slower than OSS
-                args.reference_memory,
-                args.reference_loss,
-            ),
+            args=(args, backend, OptimType.oss_sdp, False,),  # FIXME: @lefaudeux - SDP should give the same results
             nprocs=args.world_size,
             join=True,
         )
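To make the pattern in this change concrete, here is a minimal standalone sketch (not code from the repository; run_step and the toy linear model are illustrative stand-ins) of what the diff does: derive the device from a --cpu flag and guard CUDA-only bookkeeping behind it.

    import argparse
    import torch

    def run_step(rank: int, args: argparse.Namespace) -> None:
        # Device selection mirrors the diff: CPU when requested, otherwise the local GPU index.
        device = torch.device("cpu") if args.cpu else torch.device(rank)

        # Tiny stand-in model and batch instead of the ResNet101 used by the benchmark.
        model = torch.nn.Linear(8, 4).to(device)
        inputs = torch.randn(2, 8).to(device)

        # CUDA-only bookkeeping is skipped on CPU, as in the benchmark.
        if not args.cpu:
            torch.cuda.reset_peak_memory_stats(rank)
            torch.cuda.synchronize(rank)

        loss = model(inputs).sum()
        loss.backward()
        print(f"ran one step on {device}, loss={loss.item():.4f}")

    if __name__ == "__main__":
        parser = argparse.ArgumentParser()
        parser.add_argument("--cpu", action="store_true", default=False)
        run_step(rank=0, args=parser.parse_args())

Running the sketch with --cpu exercises the CPU path end to end; without the flag it expects a visible CUDA device, just as the benchmark does.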