Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
9ff1f9d4
Unverified
Commit
9ff1f9d4
authored
Dec 18, 2018
by
SparkSnail
Committed by
GitHub
Dec 18, 2018
Browse files
Add mnist-distributed-pytorch example (#483)
Add mnist-distributed-pytorch example for kubeflow
parent
56d0f08c
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
218 additions
and
0 deletions
+218
-0
examples/trials/mnist-distributed-pytorch/config_kubeflow.yml
...ples/trials/mnist-distributed-pytorch/config_kubeflow.yml
+40
-0
examples/trials/mnist-distributed-pytorch/dist_mnist.py
examples/trials/mnist-distributed-pytorch/dist_mnist.py
+174
-0
examples/trials/mnist-distributed-pytorch/search_space.json
examples/trials/mnist-distributed-pytorch/search_space.json
+4
-0
No files found.
examples/trials/mnist-distributed-pytorch/config_kubeflow.yml
0 → 100644
View file @
9ff1f9d4
# NNI experiment configuration for the distributed PyTorch MNIST example,
# submitted to a Kubeflow cluster through the pytorch-operator.
authorName: default
experimentName: example_mnist_distributed_pytorch
# One trial runs at a time; the experiment is capped at 10 trials.
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 10
#choice: local, remote, pai, kubeflow
trainingServicePlatform: kubeflow
# Hyper-parameter search space, resolved relative to this file.
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  #choice: TPE, Random, Anneal, Evolution
  builtinTunerName: TPE
  classArgs:
    #choice: maximize, minimize
    # dist_mnist.py reports the training loss as its metric, hence "minimize".
    optimize_mode: minimize
trial:
  codeDir: .
  # Master and worker replicas launch the same script; only the master
  # requests a GPU in this example.
  master:
    replicas: 1
    command: python3 dist_mnist.py
    gpuNum: 1
    cpuNum: 1
    memoryMB: 2048
    image: msranni/nni:latest
  worker:
    replicas: 1
    command: python3 dist_mnist.py
    gpuNum: 0
    cpuNum: 1
    memoryMB: 2048
    image: msranni/nni:latest
kubeflowConfig:
  operator: pytorch-operator
  apiVersion: v1alpha2
  nfs:
    # NOTE(review): the two values below are placeholders and must be replaced
    # (braces included) before use; left as-is, YAML parses `{...}` as a flow
    # mapping rather than a string.
    # Your NFS server IP, like 10.10.10.10
    server: {your_nfs_server_ip}
    # Your NFS server export path, like /var/nfs/nni
    path: {your_nfs_server_export_path}
examples/trials/mnist-distributed-pytorch/dist_mnist.py
0 → 100644
View file @
9ff1f9d4
# Copyright 2018 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# NNI (https://github.com/Microsoft/nni) modified this code to show how to
# integrate distributed pytorch training with NNI SDK
#
import
os
import
torch
import
torch.distributed
as
dist
import
torch.nn
as
nn
import
torch.nn.functional
as
F
import
torch.optim
as
optim
import
nni
import
logging
from
math
import
ceil
from
random
import
Random
from
torch.autograd
import
Variable
from
torchvision
import
datasets
,
transforms
# Module-level logger used by the training loop and the script entry point.
logger = logging.getLogger('nni_pytorch_dist')
class Partition(object):
    """Read-only view over a subset of an indexable dataset.

    ``data`` is the underlying dataset and ``index`` is the sequence of
    positions inside ``data`` that belong to this view.
    """

    def __init__(self, data, index):
        self.data, self.index = data, index

    def __len__(self):
        # The view is exactly as long as its list of selected positions.
        return len(self.index)

    def __getitem__(self, index):
        # Translate the view-local position into a position in the full
        # dataset, then fetch the sample from there.
        return self.data[self.index[index]]
class DataPartitioner(object):
    """Partition a dataset into disjoint, pseudo-randomly shuffled chunks.

    The positions ``0 .. len(data) - 1`` are shuffled with a ``Random``
    instance seeded by ``seed`` and then handed out, in order, to one
    partition per entry of ``sizes``.

    Args:
        data: Any sized, indexable dataset.
        sizes: Iterable of fractions; each partition receives
            ``int(frac * len(data))`` positions. Positions left over after
            truncation are not assigned to any partition.
        seed: Seed for the shuffle, so that every caller using the same seed
            derives the same partitions.
    """

    # The default for ``sizes`` is a tuple rather than a list so the default
    # object can never be mutated and shared between instances.
    def __init__(self, data, sizes=(0.7, 0.2, 0.1), seed=1234):
        self.data = data
        self.partitions = []
        rng = Random()
        rng.seed(seed)
        data_len = len(data)
        indexes = list(range(data_len))
        rng.shuffle(indexes)

        for frac in sizes:
            part_len = int(frac * data_len)
            # Take the next ``part_len`` shuffled positions and drop them from
            # the pool so partitions never overlap.
            self.partitions.append(indexes[:part_len])
            indexes = indexes[part_len:]

    def use(self, partition):
        """Return a ``Partition`` view over the chunk numbered ``partition``."""
        return Partition(self.data, self.partitions[partition])
class Net(nn.Module):
    """Small convolutional classifier used by the MNIST example.

    Two 5x5 convolutions (each followed by 2x2 max-pooling and ReLU) feed two
    fully connected layers; the output is a log-softmax over 10 values.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as they are applied.
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return per-sample log-probabilities for the batch ``x``."""
        first_pooled = F.max_pool2d(self.conv1(x), 2)
        first_act = F.relu(first_pooled)

        second_conv = self.conv2_drop(self.conv2(first_act))
        second_act = F.relu(F.max_pool2d(second_conv, 2))

        # Flatten to the 320 features expected by fc1 (20 channels of 4x4
        # for 28x28 inputs such as MNIST).
        flat = second_act.view(-1, 320)
        hidden = F.relu(self.fc1(flat))
        # Functional dropout must be told explicitly whether we are training.
        hidden = F.dropout(hidden, training=self.training)
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)
def partition_dataset():
    """Build the MNIST training loader for the calling rank.

    Downloads MNIST into ``./data`` if needed, splits it into one equally
    sized share per process in the world, and wraps this rank's share in a
    shuffling ``DataLoader``.

    Returns:
        A ``(train_set, bsz)`` tuple: the ``DataLoader`` for this rank and the
        per-rank batch size as a float (``128 / world_size``). The loader
        itself uses ``int(bsz)``.
    """
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    mnist = datasets.MNIST('./data', train=True, download=True,
                           transform=preprocess)

    world_size = dist.get_world_size()
    # A global batch of 128 samples is divided evenly between the ranks.
    bsz = 128 / float(world_size)

    # One equal fraction of the dataset per rank.
    equal_shares = [1.0 / world_size] * world_size
    local_share = DataPartitioner(mnist, equal_shares).use(dist.get_rank())

    train_set = torch.utils.data.DataLoader(local_share,
                                            batch_size=int(bsz),
                                            shuffle=True)
    return train_set, bsz
def average_gradients(model):
    """Average every parameter gradient of ``model`` across all processes.

    Each gradient is summed over the default process group with an
    all-reduce and then divided by the world size, in place.

    Args:
        model: Module whose ``parameters()`` have had gradients populated by
            a preceding ``backward()`` call.
    """
    size = float(dist.get_world_size())
    # Newer PyTorch releases expose the reduction enum as ``ReduceOp`` and
    # keep ``reduce_op`` only as a deprecated alias; older releases only have
    # ``reduce_op``. Pick whichever is available.
    reduce_ops = getattr(dist, 'ReduceOp', None)
    if reduce_ops is None:
        reduce_ops = dist.reduce_op
    for param in model.parameters():
        # Parameters that took no part in the backward pass have no gradient
        # to exchange.
        if param.grad is None:
            continue
        # No explicit ``group`` is passed, so the default (world) group is
        # used. NOTE(review): the previous ``group=0`` relied on integer
        # group ids, which current torch.distributed does not accept.
        dist.all_reduce(param.grad.data, op=reduce_ops.SUM)
        param.grad.data /= size
def run(params):
    """Train the MNIST network with distributed synchronous SGD.

    Every rank trains on its own share of the data and gradients are averaged
    across ranks after each backward pass. Rank 0 reports the mean loss of
    each epoch to NNI as an intermediate result and the mean over all epochs
    as the final result.

    Args:
        params: Mapping that provides ``'learning_rate'`` and ``'momentum'``
            for the SGD optimizer.
    """
    num_epochs = 3
    rank = dist.get_rank()
    torch.manual_seed(1234)
    train_set, _ = partition_dataset()
    model = Net()
    optimizer = optim.SGD(model.parameters(),
                          lr=params['learning_rate'],
                          momentum=params['momentum'])

    # Ask the loader for the real number of batches per epoch: it accounts
    # for the integer batch size actually in use, which differs from the
    # float batch size when 128 is not divisible by the world size.
    num_batches = len(train_set)
    total_loss = 0.0
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for data, target in train_set:
            # Tensors are fed to the model directly; the Variable wrapper is
            # unnecessary on PyTorch versions that provide ``loss.item()``.
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            epoch_loss += loss.item()
            loss.backward()
            # Synchronise gradients before the update so that all ranks
            # apply the same step.
            average_gradients(model)
            optimizer.step()
        mean_epoch_loss = epoch_loss / num_batches
        logger.debug('Rank: %s, epoch: %s: %s', rank, epoch, mean_epoch_loss)
        # Only one process reports, so NNI receives a single metric stream.
        if rank == 0:
            nni.report_intermediate_result(mean_epoch_loss)
        total_loss += mean_epoch_loss
    total_loss /= num_epochs
    logger.debug('Final loss: {}'.format(total_loss))
    if rank == 0:
        nni.report_final_result(total_loss)
def init_processes(fn, params, backend='tcp'):
    """Set up the distributed process group, then invoke ``fn(params)``.

    Args:
        fn: Callable run once the process group is initialised.
        params: Argument forwarded unchanged to ``fn``.
        backend: Name of the torch.distributed backend to initialise.
            NOTE(review): 'tcp' may not be offered by newer PyTorch
            releases; confirm against the version in the trial image.
    """
    dist.init_process_group(backend=backend)
    fn(params)
def generate_default_params():
    """Return a new dict holding the default MNIST hyper-parameters."""
    return {
        'learning_rate': 0.01,
        'momentum': 0.5,
    }
if __name__ == "__main__":
    # Fetch the hyper-parameters NNI chose for this trial.
    tuned_params = nni.get_next_parameter()
    logger.debug(tuned_params)

    # Start from the defaults and let the tuner-provided values override them.
    params = generate_default_params()
    params.update(tuned_params)

    # Join the process group, then start training.
    init_processes(run, params)
examples/trials/mnist-distributed-pytorch/search_space.json
0 → 100644
View file @
9ff1f9d4
{
"learning_rate"
:{
"_type"
:
"choice"
,
"_value"
:[
0.0001
,
0.001
,
0.01
,
0.1
]},
"momentum"
:{
"_type"
:
"choice"
,
"_value"
:[
0.4
,
0.5
,
0.6
]}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment