Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
fairscale
Commits
10062e58
Unverified
Commit
10062e58
authored
Oct 16, 2020
by
Benjamin Lefaudeux
Committed by
GitHub
Oct 16, 2020
Browse files
[feat][minor] OSS: benchmark - adding a cpu option (#144)
* adding a cpu option * adjust the reference loss
parent
61234360
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
28 additions
and
68 deletions
+28
-68
.circleci/config.yml
.circleci/config.yml
+1
-1
benchmarks/oss.py
benchmarks/oss.py
+27
-67
No files found.
.circleci/config.yml
View file @
10062e58
...
@@ -108,7 +108,7 @@ run_oss_benchmark: &run_oss_benchmark
...
@@ -108,7 +108,7 @@ run_oss_benchmark: &run_oss_benchmark
-
run
:
-
run
:
name
:
Run OSS Benchmark
name
:
Run OSS Benchmark
command
:
|
command
:
|
python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 13.7 --reference_memory 4390 --reference_loss 0.
15
2
python benchmarks/oss.py --check_regression --world_size 4 --reference_speed 13.7 --reference_memory 4390 --reference_loss 0.
30
2
run_oss_gloo
:
&run_oss_gloo
run_oss_gloo
:
&run_oss_gloo
-
run
:
-
run
:
...
...
benchmarks/oss.py
View file @
10062e58
...
@@ -30,15 +30,15 @@ def dist_init(rank, world_size, backend):
...
@@ -30,15 +30,15 @@ def dist_init(rank, world_size, backend):
dist
.
init_process_group
(
backend
=
backend
,
init_method
=
"tcp://localhost:29501"
,
rank
=
rank
,
world_size
=
world_size
)
dist
.
init_process_group
(
backend
=
backend
,
init_method
=
"tcp://localhost:29501"
,
rank
=
rank
,
world_size
=
world_size
)
def
get_problem
(
rank
,
data_size
,
batch_size
):
def
get_problem
(
rank
,
data_size
,
batch_size
,
device
):
# Standard RN101
# Standard RN101
model
=
resnet101
(
pretrained
=
False
,
progress
=
True
).
to
(
rank
)
model
=
resnet101
(
pretrained
=
False
,
progress
=
True
).
to
(
device
)
# Data setup, dummy data
# Data setup, dummy data
def
collate
(
inputs
:
List
[
Any
]):
def
collate
(
inputs
:
List
[
Any
]):
return
{
return
{
"inputs"
:
torch
.
stack
([
i
[
0
]
for
i
in
inputs
]).
to
(
torch
.
device
(
rank
)
),
"inputs"
:
torch
.
stack
([
i
[
0
]
for
i
in
inputs
]).
to
(
device
),
"label"
:
torch
.
stack
([
i
[
1
]
for
i
in
inputs
]).
to
(
torch
.
device
(
rank
)
),
"label"
:
torch
.
stack
([
i
[
1
]
for
i
in
inputs
]).
to
(
device
),
}
}
dataloader
=
DataLoader
(
dataloader
=
DataLoader
(
...
@@ -59,20 +59,13 @@ class OptimType(str, Enum):
...
@@ -59,20 +59,13 @@ class OptimType(str, Enum):
def
train
(
def
train
(
rank
:
int
,
rank
:
int
,
world_size
:
int
,
args
:
argparse
.
Namespace
,
num_epochs
:
int
=
10
,
batch_size
:
int
=
32
,
data_size
:
int
=
200
,
backend
:
str
=
"gloo"
,
backend
:
str
=
"gloo"
,
optim_type
:
OptimType
=
OptimType
.
vanilla
,
optim_type
:
OptimType
=
OptimType
.
vanilla
,
profile
:
bool
=
False
,
check_regression
:
bool
=
True
,
check_regression
:
bool
=
True
,
reference_speed
:
float
=
-
1.0
,
reference_memory
:
float
=
-
1.0
,
reference_loss
:
float
=
-
1.0
,
):
):
# DDP
# DDP
dist_init
(
rank
=
rank
,
world_size
=
world_size
,
backend
=
backend
)
dist_init
(
rank
=
rank
,
world_size
=
args
.
world_size
,
backend
=
backend
)
# Setup
# Setup
torch
.
cuda
.
set_device
(
rank
)
torch
.
cuda
.
set_device
(
rank
)
...
@@ -84,7 +77,8 @@ def train(
...
@@ -84,7 +77,8 @@ def train(
torch
.
backends
.
cudnn
.
deterministic
=
True
torch
.
backends
.
cudnn
.
deterministic
=
True
torch
.
backends
.
cudnn
.
benchmark
=
False
torch
.
backends
.
cudnn
.
benchmark
=
False
model
,
dataloader
,
loss_fn
=
get_problem
(
rank
,
data_size
,
batch_size
)
device
=
torch
.
device
(
"cpu"
)
if
args
.
cpu
else
torch
.
device
(
rank
)
model
,
dataloader
,
loss_fn
=
get_problem
(
rank
,
args
.
data_size
,
args
.
batch_size
,
device
)
# Shard the optimizer
# Shard the optimizer
optimizer
:
Optional
[
torch
.
optim
.
Optimizer
]
=
None
optimizer
:
Optional
[
torch
.
optim
.
Optimizer
]
=
None
...
@@ -94,7 +88,7 @@ def train(
...
@@ -94,7 +88,7 @@ def train(
module
=
model
,
module
=
model
,
optimizer
=
OPTIM
,
optimizer
=
OPTIM
,
optimizer_params
=
{
"lr"
:
1e-4
,
"momentum"
:
0.9
},
optimizer_params
=
{
"lr"
:
1e-4
,
"momentum"
:
0.9
},
world_size
=
world_size
,
world_size
=
args
.
world_size
,
broadcast_buffers
=
True
,
broadcast_buffers
=
True
,
)
)
ddp
.
train
()
ddp
.
train
()
...
@@ -109,18 +103,19 @@ def train(
...
@@ -109,18 +103,19 @@ def train(
)
)
# Reset the memory use counter
# Reset the memory use counter
torch
.
cuda
.
reset_peak_memory_stats
(
rank
)
if
not
args
.
cpu
:
torch
.
cuda
.
reset_peak_memory_stats
(
rank
)
torch
.
cuda
.
synchronize
(
rank
)
# Dummy training loop
# Dummy training loop
torch
.
cuda
.
synchronize
(
rank
)
training_start
=
time
.
monotonic
()
training_start
=
time
.
monotonic
()
model
.
train
()
model
.
train
()
measurements
=
[]
measurements
=
[]
final_loss
:
Optional
[
float
]
=
-
1.0
final_loss
:
Optional
[
float
]
=
-
1.0
need_profiling
=
profile
need_profiling
=
args
.
profile
for
epoch
in
range
(
num_
epochs
):
for
epoch
in
range
(
args
.
epochs
):
epoch_start
=
time
.
monotonic
()
epoch_start
=
time
.
monotonic
()
for
batch
in
dataloader
:
for
batch
in
dataloader
:
...
@@ -129,7 +124,6 @@ def train(
...
@@ -129,7 +124,6 @@ def train(
model
.
zero_grad
()
model
.
zero_grad
()
outputs
=
model
(
batch
[
"inputs"
])
outputs
=
model
(
batch
[
"inputs"
])
loss
=
loss_fn
(
outputs
,
batch
[
"label"
])
loss
=
loss_fn
(
outputs
,
batch
[
"label"
])
loss
/=
world_size
loss
.
backward
()
loss
.
backward
()
if
optim_type
==
OptimType
.
oss_sdp
:
if
optim_type
==
OptimType
.
oss_sdp
:
...
@@ -137,7 +131,7 @@ def train(
...
@@ -137,7 +131,7 @@ def train(
return
loss
return
loss
if
need_profiling
:
if
need_profiling
and
not
args
.
cpu
:
print
(
"Profiling the run"
)
print
(
"Profiling the run"
)
with
profiler
.
profile
(
use_cuda
=
True
)
as
prof
:
# type: ignore
with
profiler
.
profile
(
use_cuda
=
True
)
as
prof
:
# type: ignore
with
profiler
.
record_function
(
"batch"
):
with
profiler
.
record_function
(
"batch"
):
...
@@ -163,13 +157,14 @@ def train(
...
@@ -163,13 +157,14 @@ def train(
_
=
optimizer
.
state_dict
()
_
=
optimizer
.
state_dict
()
print
(
"... State dict collected"
)
print
(
"... State dict collected"
)
measurements
.
append
(
data_size
/
(
epoch_end
-
epoch_start
))
measurements
.
append
(
args
.
data_size
/
(
epoch_end
-
epoch_start
))
if
dist
.
get_rank
()
==
0
:
if
dist
.
get_rank
()
==
0
:
print
(
f
"Epoch
{
epoch
}
- processed
{
measurements
[
-
1
]:.
2
f
}
img per sec. Loss
{
final_loss
:.
3
f
}
"
)
print
(
f
"Epoch
{
epoch
}
- processed
{
measurements
[
-
1
]:.
2
f
}
img per sec. Loss
{
final_loss
:.
3
f
}
"
)
torch
.
cuda
.
synchronize
(
rank
)
if
not
args
.
cpu
:
torch
.
cuda
.
synchronize
(
rank
)
training_stop
=
time
.
monotonic
()
training_stop
=
time
.
monotonic
()
img_per_sec
=
data_size
/
(
training_stop
-
training_start
)
*
num_
epochs
img_per_sec
=
args
.
data_size
/
(
training_stop
-
training_start
)
*
args
.
epochs
max_memory
=
torch
.
cuda
.
max_memory_allocated
(
rank
)
/
2
**
20
max_memory
=
torch
.
cuda
.
max_memory_allocated
(
rank
)
/
2
**
20
print
(
f
"[
{
dist
.
get_rank
()
}
] : Training done.
{
img_per_sec
:.
2
f
}
img per sec overall"
)
print
(
f
"[
{
dist
.
get_rank
()
}
] : Training done.
{
img_per_sec
:.
2
f
}
img per sec overall"
)
...
@@ -182,9 +177,9 @@ def train(
...
@@ -182,9 +177,9 @@ def train(
print
(
f
"[
{
dist
.
get_rank
()
}
] : Mean speed:
{
mean
:.
2
f
}
+/-
{
std
:.
2
f
}
"
)
print
(
f
"[
{
dist
.
get_rank
()
}
] : Mean speed:
{
mean
:.
2
f
}
+/-
{
std
:.
2
f
}
"
)
if
check_regression
and
dist
.
get_rank
()
==
0
:
if
check_regression
and
dist
.
get_rank
()
==
0
:
assert
(
mean
+
3.0
*
std
)
>
reference_speed
,
"Speed regression detected"
assert
(
mean
+
3.0
*
std
)
>
args
.
reference_speed
,
"Speed regression detected"
assert
max_memory
<
1.05
*
reference_memory
,
"Memory use regression detected"
assert
max_memory
<
1.05
*
args
.
reference_memory
,
"Memory use regression detected"
assert
abs
(
cast
(
float
,
final_loss
)
-
reference_loss
)
<
1e-3
,
"Loss regression detected"
assert
abs
(
cast
(
float
,
final_loss
)
-
args
.
reference_loss
)
<
1e-3
,
"Loss regression detected"
print
(
"[Regression Test] VALID"
)
print
(
"[Regression Test] VALID"
)
...
@@ -208,26 +203,18 @@ if __name__ == "__main__":
...
@@ -208,26 +203,18 @@ if __name__ == "__main__":
)
)
parser
.
add_argument
(
"--gloo"
,
action
=
"store_true"
,
default
=
False
)
parser
.
add_argument
(
"--gloo"
,
action
=
"store_true"
,
default
=
False
)
parser
.
add_argument
(
"--profile"
,
action
=
"store_true"
,
default
=
False
)
parser
.
add_argument
(
"--profile"
,
action
=
"store_true"
,
default
=
False
)
parser
.
add_argument
(
"--cpu"
,
action
=
"store_true"
,
default
=
False
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
print
(
f
"Benchmark arguments:
{
args
}
"
)
print
(
f
"Benchmark arguments:
{
args
}
"
)
backend
=
"nccl"
if
not
args
.
gloo
or
not
torch
.
cuda
.
is_available
()
else
"gloo"
backend
=
"nccl"
if
(
not
args
.
gloo
or
not
torch
.
cuda
.
is_available
()
)
and
not
args
.
cpu
else
"gloo"
if
args
.
optim_type
==
OptimType
.
vanilla
or
args
.
optim_type
==
OptimType
.
everyone
:
if
args
.
optim_type
==
OptimType
.
vanilla
or
args
.
optim_type
==
OptimType
.
everyone
:
print
(
"
\n
Benchmark vanilla optimizer"
)
print
(
"
\n
Benchmark vanilla optimizer"
)
mp
.
spawn
(
mp
.
spawn
(
train
,
train
,
args
=
(
args
=
(
args
,
backend
,
OptimType
.
vanilla
,
False
,),
# no regression check
args
.
world_size
,
args
.
epochs
,
args
.
batch_size
,
args
.
data_size
,
backend
,
OptimType
.
vanilla
,
args
.
profile
,
False
,
# no regression check
),
nprocs
=
args
.
world_size
,
nprocs
=
args
.
world_size
,
join
=
True
,
join
=
True
,
)
)
...
@@ -235,41 +222,14 @@ if __name__ == "__main__":
...
@@ -235,41 +222,14 @@ if __name__ == "__main__":
if
args
.
optim_type
==
OptimType
.
oss
or
args
.
optim_type
==
OptimType
.
everyone
:
if
args
.
optim_type
==
OptimType
.
oss
or
args
.
optim_type
==
OptimType
.
everyone
:
print
(
"
\n
Benchmark OSS with DDP"
)
print
(
"
\n
Benchmark OSS with DDP"
)
mp
.
spawn
(
mp
.
spawn
(
train
,
train
,
args
=
(
args
,
backend
,
OptimType
.
oss
,
args
.
check_regression
),
nprocs
=
args
.
world_size
,
join
=
True
,
args
=
(
args
.
world_size
,
args
.
epochs
,
args
.
batch_size
,
args
.
data_size
,
backend
,
OptimType
.
oss
,
args
.
profile
,
args
.
check_regression
,
args
.
reference_speed
,
args
.
reference_memory
,
args
.
reference_loss
,
),
nprocs
=
args
.
world_size
,
join
=
True
,
)
)
if
args
.
optim_type
==
OptimType
.
oss_sdp
or
args
.
optim_type
==
OptimType
.
everyone
:
if
args
.
optim_type
==
OptimType
.
oss_sdp
or
args
.
optim_type
==
OptimType
.
everyone
:
print
(
"
\n
Benchmark OSS with SDP"
)
print
(
"
\n
Benchmark OSS with SDP"
)
mp
.
spawn
(
mp
.
spawn
(
train
,
train
,
args
=
(
args
=
(
args
,
backend
,
OptimType
.
oss_sdp
,
False
,),
# FIXME: @lefaudeux - SDP should give the same results
args
.
world_size
,
args
.
epochs
,
args
.
batch_size
,
args
.
data_size
,
backend
,
OptimType
.
oss_sdp
,
args
.
profile
,
False
,
# FIXME: @lefaudeux - SDP should give the same results
-
1
,
# Not checking SDP for speed regression for now, still slower than OSS
args
.
reference_memory
,
args
.
reference_loss
,
),
nprocs
=
args
.
world_size
,
nprocs
=
args
.
world_size
,
join
=
True
,
join
=
True
,
)
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment