OpenDAS / fairscale · Commits

Commit 7bdb9a7f (unverified)
remove examples dir (#712)
Authored Jun 10, 2021 by anj-s; committed by GitHub on Jun 10, 2021
Parent: 370b8483
Changes: 6 changed files, 0 additions and 499 deletions (+0 −499)

  examples/helpers.py            +0 −22
  examples/mnist_test_oss.py     +0 −145
  examples/mnist_test_pipe.py    +0 −135
  examples/tutorial_oss.py       +0 −85
  examples/tutorial_pipe.py      +0 −38
  examples/tutorial_pipe_rpc.py  +0 −74
examples/helpers.py (deleted, 100644 → 0)
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F


def dist_init(rank, world_size):
    backend = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO  # type: ignore
    print(f"Using backend: {backend}")
    dist.init_process_group(backend=backend, init_method="tcp://localhost:29501", rank=rank, world_size=world_size)


def get_model():
    return nn.Sequential(torch.nn.Linear(10, 10), torch.nn.ReLU(), torch.nn.Linear(10, 5))


def get_data(n_batches=1):
    return [(torch.randn(20, 10), torch.randint(0, 2, size=(20, 1)).squeeze()) for i in range(n_batches)]


def get_loss_fun():
    return F.nll_loss
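The helpers above are what the tutorials further down import. As a rough illustration of how they compose, here is a hedged sketch that is not part of the deleted files: the two-process spawn, the _worker name, and running on CPU over gloo (with port 29501 assumed free) are all assumptions, not anything this commit contained.

# Hypothetical usage sketch of examples/helpers.py (not in the commit):
# spawn two workers, initialise torch.distributed, and run one forward/backward
# pass with the toy model, data, and loss function the helpers return.
import torch.multiprocessing as mp

from helpers import dist_init, get_data, get_loss_fun, get_model


def _worker(rank, world_size):
    dist_init(rank, world_size)        # gloo on CPU, tcp://localhost:29501
    model = get_model()                # Linear(10, 10) -> ReLU -> Linear(10, 5)
    loss_fn = get_loss_fun()           # F.nll_loss
    for data, target in get_data(n_batches=1):
        loss = loss_fn(model(data), target)
        loss.backward()


if __name__ == "__main__":
    mp.spawn(_worker, args=(2,), nprocs=2, join=True)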
examples/mnist_test_oss.py (deleted, 100644 → 0)
# adapted from https://github.com/pytorch/examples/blob/master/mnist/main.py
from __future__ import print_function

import argparse
import time

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms

from fairscale.nn.data_parallel import ShardedDataParallel

WORLD_SIZE = 2
OPTIM = torch.optim.RMSprop
BACKEND = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO


def dist_init(rank, world_size, backend):
    print(f"Using backend: {backend}")
    dist.init_process_group(backend=backend, init_method="tcp://localhost:29501", rank=rank, world_size=world_size)


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(rank, args, model, device, train_loader, num_epochs):
    ##############
    # SETUP
    dist_init(rank, WORLD_SIZE, BACKEND)

    ddp = ShardedDataParallel(
        module=model,
        optimizer=torch.optim.Adadelta,
        optimizer_params={"lr": 1e-4},
        world_size=WORLD_SIZE,
        broadcast_buffers=True,
    )
    ddp.train()
    optimizer = ddp.optimizer

    # Reset the memory use counter
    torch.cuda.reset_peak_memory_stats(rank)

    # Training loop
    torch.cuda.synchronize(rank)
    training_start = time.monotonic()
    loss_fn = nn.CrossEntropyLoss()
    ##############

    model.train()
    measurements = []
    for epoch in range(num_epochs):
        epoch_start = time.monotonic()

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)

            def closure():
                model.zero_grad()
                outputs = model(data)
                loss = loss_fn(outputs, target)
                loss.backward()
                ddp.reduce()  # Send the gradients to the appropriate shards
                return loss

            optimizer.step(closure)

        epoch_end = time.monotonic()

    torch.cuda.synchronize(rank)
    training_stop = time.monotonic()
    print("Total Time:", training_stop - training_start)


def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument("--batch-size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)")
    parser.add_argument("--test-batch-size", type=int, default=1000, metavar="N", help="input batch size for testing (default: 1000)")
    parser.add_argument("--epochs", type=int, default=14, metavar="N", help="number of epochs to train (default: 14)")
    parser.add_argument("--lr", type=float, default=1.0, metavar="LR", help="learning rate (default: 1.0)")
    parser.add_argument("--gamma", type=float, default=0.7, metavar="M", help="Learning rate step gamma (default: 0.7)")
    parser.add_argument("--no-cuda", action="store_true", default=False, help="disables CUDA training")
    parser.add_argument("--dry-run", action="store_true", default=False, help="quickly check a single pass")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    parser.add_argument("--save-model", action="store_true", default=False, help="For Saving the current Model")
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {"batch_size": args.batch_size}
    if use_cuda:
        kwargs.update({"num_workers": 1, "pin_memory": True, "shuffle": True},)

    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **kwargs)

    model = Net().to(device)

    mp.spawn(
        train,
        args=(args, model, device, train_loader, args.epochs),
        nprocs=WORLD_SIZE,
        join=True,
    )


if __name__ == "__main__":
    main()
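Note that, unlike examples/tutorial_pipe_rpc.py further down, this script spawns its own WORLD_SIZE = 2 workers through mp.spawn, so a single direct invocation is enough. A hedged example invocation (the flag values are illustrative; the flag names come from the argparse block above):

    python examples/mnist_test_oss.py --epochs 1 --batch-size 64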
examples/mnist_test_pipe.py (deleted, 100644 → 0)
# adapted from https://github.com/pytorch/examples/blob/master/mnist/main.py
from __future__ import print_function

import argparse
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torchvision import datasets, transforms

from fairscale.nn import Pipe

net = nn.Sequential(
    nn.Conv2d(1, 32, 3, 1),
    nn.ReLU(),
    nn.Conv2d(32, 64, 3, 1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Dropout2d(0.25),
    nn.Flatten(1),
    nn.Linear(9216, 128),
    nn.ReLU(),
    nn.Dropout2d(0.5),
    nn.Linear(128, 10),
    nn.LogSoftmax(dim=1),
)


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output.to(device), target.to(device))
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print(
                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch,
                    batch_idx * len(data),
                    len(train_loader.dataset),
                    100.0 * batch_idx / len(train_loader),
                    loss.item(),
                )
            )
            if args.dry_run:
                break


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output.to(device), target.to(device), reduction="sum").item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True).to(device)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print(
        "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
            test_loss, correct, len(test_loader.dataset), 100.0 * correct / len(test_loader.dataset)
        )
    )


def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument("--batch-size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)")
    parser.add_argument("--test-batch-size", type=int, default=1000, metavar="N", help="input batch size for testing (default: 1000)")
    parser.add_argument("--epochs", type=int, default=14, metavar="N", help="number of epochs to train (default: 14)")
    parser.add_argument("--lr", type=float, default=1.0, metavar="LR", help="learning rate (default: 1.0)")
    parser.add_argument("--gamma", type=float, default=0.7, metavar="M", help="Learning rate step gamma (default: 0.7)")
    parser.add_argument("--dry-run", action="store_true", default=False, help="quickly check a single pass")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    parser.add_argument("--save-model", action="store_true", default=False, help="For Saving the current Model")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    kwargs = {"batch_size": args.batch_size}
    kwargs.update({"num_workers": 1, "pin_memory": True, "shuffle": True},)

    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform)
    dataset2 = datasets.MNIST("../data", train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **kwargs)

    model = net
    model = Pipe(model, balance=[6, 6], devices=[0, 1], chunks=2)
    device = model.devices[0]
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        tic = time.perf_counter()
        train(args, model, device, train_loader, optimizer, epoch)
        toc = time.perf_counter()
        print(f">>> TRAINING Time {toc - tic:0.4f} seconds")
        tic = time.perf_counter()
        test(model, device, test_loader)
        toc = time.perf_counter()
        print(f">>> TESTING Time {toc - tic:0.4f} seconds")
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == "__main__":
    main()
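For clarity on the Pipe arguments used in main() above: the balance=[6, 6] partition has to account for every module of the wrapped nn.Sequential, and chunks=2 splits each incoming batch into two micro-batches (32 samples each with the default --batch-size of 64). A small hedged sanity check, not part of the deleted file:

# Hypothetical check (assumes the `net` defined in this file is in scope):
# the pipeline balance must cover all twelve modules of the Sequential.
assert len(net) == 12
assert sum([6, 6]) == len(net)   # six modules per pipeline stage (devices 0 and 1)
micro_batch = 64 // 2            # default --batch-size split across chunks=2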
examples/tutorial_oss.py (deleted, 100644 → 0)
import time
from typing import Optional, Union, cast

from helpers import dist_init, get_data, get_loss_fun, get_model

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

from fairscale.optim.oss import OSS

WORLD_SIZE = 2
EPOCHS = 3
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def train(rank: int, world_size: int, epochs: int, use_oss: bool):
    # DDP
    dist_init(rank, world_size)
    device = torch.device("cpu") if DEVICE == "cpu" else rank  # type:ignore

    # Problem statement
    model = get_model().to(device)
    dataloader = get_data(n_batches=1)
    loss_fn = get_loss_fun()

    optimizer: Optional[Union[OSS, torch.optim.SGD]] = None

    if not use_oss:
        optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)
    else:
        base_optimizer = torch.optim.SGD
        base_optimizer_arguments = {"lr": 1e-4}  # any optimizer specific arguments, LR, momentum, etc...
        optimizer = OSS(
            params=model.parameters(), optim=base_optimizer, broadcast_buffer_size=2 ** 17, **base_optimizer_arguments
        )

    training_start = time.monotonic()

    # Any relevant training loop, nothing specific to OSS. For example:
    model.train()
    for _ in range(epochs):
        for (data, target) in dataloader:
            data, target = data.to(device), target.to(device)

            # Train
            model.zero_grad()
            outputs = model(data)
            loss = loss_fn(outputs, target)
            loss.backward()

            # if you want to clip the gradients / get the current max:
            max_norm = 1000.0
            norm_type = 1
            if not use_oss:
                _total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm, norm_type=norm_type)  # type: ignore
            else:
                optimizer = cast(OSS, optimizer)
                _total_norm = optimizer.clip_grad_norm(max_norm, norm_type=norm_type)

            optimizer.step()

            print(f"Loss: {loss.item()}")

    training_end = time.monotonic()
    print(f"[{dist.get_rank()}] : Training done. {training_end - training_start:.2f} sec")

    if DEVICE == "cuda":
        max_memory = torch.cuda.max_memory_allocated(rank)
        print(f"[{dist.get_rank()}] : Peak memory {max_memory:.1f}MiB")


if __name__ == "__main__":
    training_start1 = time.monotonic()
    mp.spawn(train, args=(WORLD_SIZE, EPOCHS, False), nprocs=WORLD_SIZE, join=True)
    training_end1 = time.monotonic()

    training_start2 = time.monotonic()
    mp.spawn(train, args=(WORLD_SIZE, EPOCHS, True), nprocs=WORLD_SIZE, join=True)
    training_end2 = time.monotonic()

    print("Total Time without:", training_end1 - training_start1)
    print("Total Time with:", training_end2 - training_start2)
examples/tutorial_pipe.py (deleted, 100644 → 0)
from helpers import get_data, get_loss_fun, get_model

import torch
import torch.optim as optim

import fairscale

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
RANK = 0  # example

model = get_model()
data, target = get_data()[0]
loss_fn = get_loss_fun()

model = fairscale.nn.Pipe(model, balance=[2, 1])

# define optimizer and loss function
optimizer = optim.SGD(model.parameters(), lr=0.001)

# zero the parameter gradients
optimizer.zero_grad()

device = torch.device("cuda", RANK) if DEVICE == "cuda" else torch.device("cpu")

# outputs and target need to be on the same device
# forward step
outputs = model(data.to(device).requires_grad_())
# compute loss
loss = loss_fn(outputs.to(device), target.to(device))

# backward + optimize
loss.backward()
optimizer.step()

print("Finished Training Step")

del model
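Here balance=[2, 1] partitions the three-module Sequential returned by helpers.get_model(): the first Linear and the ReLU form one pipeline stage, the last Linear the other. A hedged sanity check, not part of the deleted file:

# Hypothetical check: the partition sizes passed to fairscale.nn.Pipe must add
# up to the number of modules in the wrapped Sequential.
from helpers import get_model

assert sum([2, 1]) == len(get_model())  # Linear(10, 10), ReLU, Linear(10, 5)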
examples/tutorial_pipe_rpc.py (deleted, 100644 → 0)
# run with:
# mpirun -np 2 --host localhost:2 -x PYTHONPATH=$PWD python examples/tutorial_pipe_rpc.py
import os

from helpers import dist_init, get_data, get_loss_fun, get_model

import torch
import torch.optim as optim
import torch_pg

import fairscale
from fairscale.nn.model_parallel import initialize_model_parallel


def register_optimizer(ctx, model):
    # Set the optimizer as an attribute on the model so we can access it later
    model.optimizer = optim.SGD(model.parameters(), **ctx)
    # zero the parameter gradients
    model.optimizer.zero_grad()


def run_optimizer(ctx, model):
    model.optimizer.step()


def run(rank, world_size):
    torch_pg.init_mpi()
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "10638"
    dist_init(rank, world_size)
    # FIXME (supports gloo)
    os.environ["MASTER_PORT"] = "10639"
    torch.distributed.rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size)
    initialize_model_parallel(1, world_size, pipeline_backend="mpi")

    if rank == 1:
        # For RPC, all ranks other than 0 just need to call rpc.shutdown()
        torch.distributed.rpc.shutdown()
        return

    model = get_model()
    data, target = get_data()[0]
    loss_fn = get_loss_fun()

    device = torch.device("cuda", rank)

    model = fairscale.nn.PipeRPCWrapper(
        model,
        balance=[2, 1],
        worker_map={0: "worker0", 1: "worker1"},  # Needed to convert ranks to RPC worker names
        input_device=device,
    ).to(device)

    # We can't directly access the model on each worker, so we need to call
    # foreach_worker with a callback to setup the optimizer
    model.foreach_worker(register_optimizer, {"lr": 0.001}, include_self=True)

    outputs = model(data.to(device))
    loss = loss_fn(outputs.to(device), target.to(device))
    loss.backward()

    # Same as earlier, use foreach_worker to step the optimizer on each rank
    model.foreach_worker(run_optimizer, include_self=True)

    print(f"Finished Training Step on {rank}")

    torch.distributed.rpc.shutdown()

    del model


if __name__ == "__main__":
    rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
    world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
    run(rank, world_size)