Commit 21165b53 (unverified) — OpenDAS/nni

Authored Mar 07, 2019 by SparkSnail; committed via GitHub on Mar 07, 2019.

Merge pull request #138 from Microsoft/master — merge master

Parents: 41a9a598, f10c3311

37 changed files in total; this page shows 17 of them, with 573 additions and 407 deletions (+573 −407).
Changed files on this page:

- examples/trials/mnist-distributed/dist_mnist.py (+264 −242)
- examples/trials/mnist-hyperband/mnist.py (+11 −2)
- examples/trials/mnist/mnist.py (+11 −2)
- examples/trials/mnist/mnist_before.py (+11 −2)
- src/sdk/pynni/nni/curvefitting_assessor/curvefitting_assessor.py (+5 −4)
- src/sdk/pynni/nni/metis_tuner/Regression_GP/OutlierDetection.py (+12 −14)
- src/sdk/pynni/nni/metis_tuner/metis_tuner.py (+83 −53)
- src/webui/src/components/TrialsDetail.tsx (+1 −1)
- test/config_test/examples/mnist-annotation.test.yml (+2 −2)
- test/config_test/examples/mnist.test.yml (+2 −2)
- test/config_test/multi_phase/multi_phase.test.yml (+2 −2)
- test/pipelines-it-kubeflow.yml (+32 −19)
- test/pipelines-it-pai.yml (+35 −19)
- tools/nni_annotation/README_zh_CN.md (+63 −37)
- tools/nni_annotation/examples/mnist_generated.py (+17 −2)
- tools/nni_annotation/examples/mnist_with_annotation.py (+11 −2)
- tools/nni_annotation/examples/mnist_without_annotation.py (+11 −2)
examples/trials/mnist-distributed/dist_mnist.py (+264 −242)

The distributed MNIST trial now downloads the dataset through a retry helper instead of calling `input_data.read_data_sets` directly. The rest of the diff is dominated by re-indentation, so the raw view shows the body of `main()` twice; it is reconstructed once below, with the behavioral changes marked.

```diff
@@ -13,10 +13,10 @@
 # limitations under the License.
 # ==============================================================================
 #
 # NNI (https://github.com/Microsoft/nni) modified this code to show how to
 # integrate distributed tensorflow training with NNI SDK
 #
 """Distributed MNIST training and validation, with model replicas.
 A simple softmax model with one hidden layer is defined. The parameters
```

(whitespace-only hunk: the header comment is unchanged)

@@ -54,19 +54,22 @@ import nni — the flag definitions are re-wrapped; their content is unchanged:

```python
flags = tf.app.flags
flags.DEFINE_string("data_dir", "/tmp/mnist-data",
                    "Directory for storing mnist data")
flags.DEFINE_boolean("download_only", False,
                     "Only perform downloading of data; Do not proceed to "
                     "session preparation, model definition or training")
flags.DEFINE_integer("task_index", None,
                     "Worker task index, should be >= 0. task_index=0 is "
                     "the master worker task the performs the variable "
                     "initialization ")
flags.DEFINE_integer("num_gpus", 1,
                     "Total number of gpus for each machine."
                     "If you don't use GPU, please set it to '0'")
flags.DEFINE_integer("replicas_to_aggregate", None,
                     "Number of replicas to aggregate before parameter update"
                     "is applied (For sync_replicas mode only; default: "
                     "num_workers)")
flags.DEFINE_integer("train_steps", 20000,
                     "Number of (global) training steps to perform")
flags.DEFINE_boolean(   # (the diff view truncates here)
```

@@ -96,237 +99,256 @@ IMAGE_PIXELS = 28 — the module body, reconstructed once as the new version; the behavioral changes are marked with `# was:` / `# new` comments:

```python
# {'cluster': cluster,
#  'task': {'type': 'worker', 'index': 1}})
def generate_default_params():
    '''
    Generate default hyper parameters
    '''
    return {
        'learning_rate': 0.01,
        'batch_size': 100,
        'hidden_units': 100,
    }

def download_mnist_retry(data_dir, max_num_retries=20):   # new in this commit
    """Try to download mnist dataset and avoid errors"""
    for _ in range(max_num_retries):
        try:
            return input_data.read_data_sets(data_dir, one_hot=True)
        except tf.errors.AlreadyExistsError:
            time.sleep(1)
    raise Exception("Failed to download MNIST.")

def main(unused_argv):
    # Receive NNI hyper parameter and update it onto default params
    RECEIVED_PARAMS = nni.get_next_parameter()
    PARAMS = generate_default_params()
    PARAMS.update(RECEIVED_PARAMS)

    # Parse environment variable TF_CONFIG to get job_name and task_index
    # If not explicitly specified in the constructor and the TF_CONFIG
    # environment variable is present, load cluster_spec from TF_CONFIG.
    tf_config = json.loads(os.environ.get('TF_CONFIG') or '{}')
    task_config = tf_config.get('task', {})
    task_type = task_config.get('type')
    task_index = task_config.get('index')
    FLAGS.job_name = task_type
    FLAGS.task_index = task_index

    mnist = download_mnist_retry(FLAGS.data_dir)  # was: input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
    if FLAGS.download_only:
        sys.exit(0)

    if FLAGS.job_name is None or FLAGS.job_name == "":
        raise ValueError("Must specify an explicit `job_name`")
    if FLAGS.task_index is None or FLAGS.task_index == "":
        raise ValueError("Must specify an explicit `task_index`")

    print("job name = %s" % FLAGS.job_name)
    print("task index = %d" % FLAGS.task_index)

    cluster_config = tf_config.get('cluster', {})
    ps_hosts = cluster_config.get('ps')
    worker_hosts = cluster_config.get('worker')
    ps_hosts_str = ','.join(ps_hosts)
    worker_hosts_str = ','.join(worker_hosts)
    FLAGS.ps_hosts = ps_hosts_str
    FLAGS.worker_hosts = worker_hosts_str

    # Construct the cluster and start the server
    ps_spec = FLAGS.ps_hosts.split(",")
    worker_spec = FLAGS.worker_hosts.split(",")

    # Get the number of workers.
    num_workers = len(worker_spec)

    cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})

    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
        if FLAGS.job_name == "ps":
            server.join()

    is_chief = (FLAGS.task_index == 0)
    if FLAGS.num_gpus > 0:
        # Avoid gpu allocation conflict: now allocate task_num -> #gpu
        # for each worker in the corresponding machine
        gpu = (FLAGS.task_index % FLAGS.num_gpus)
        worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu)
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to worker server
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu)

    # The device setter will automatically place Variables ops on separate
    # parameter servers (ps). The non-Variable ops will be placed on the workers.
    # The ps use CPU and workers use corresponding GPU
    with tf.device(
            tf.train.replica_device_setter(
                worker_device=worker_device,
                ps_device="/job:ps/cpu:0",
                cluster=cluster)):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Variables of the hidden layer
        hid_w = tf.Variable(
            tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, PARAMS['hidden_units']],
                stddev=1.0 / IMAGE_PIXELS),
            name="hid_w")
        hid_b = tf.Variable(tf.zeros([PARAMS['hidden_units']]), name="hid_b")

        # Variables of the softmax layer
        sm_w = tf.Variable(
            tf.truncated_normal(
                [PARAMS['hidden_units'], 10],
                stddev=1.0 / math.sqrt(PARAMS['hidden_units'])),
            name="sm_w")
        sm_b = tf.Variable(tf.zeros([10]), name="sm_b")

        # Ops: located on the worker specified with FLAGS.task_index
        x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
        y_ = tf.placeholder(tf.float32, [None, 10])

        hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
        hid = tf.nn.relu(hid_lin)

        y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
        cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

        opt = tf.train.AdamOptimizer(PARAMS['learning_rate'])

        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate

            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="mnist_sync_replicas")

        train_step = opt.minimize(cross_entropy, global_step=global_step)

        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                local_init_op = opt.chief_init_op

            ready_for_local_init_op = opt.ready_for_local_init_op

            # Initial token and chief queue runners required by the sync_replicas mode
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()

        init_op = tf.global_variables_initializer()
        train_dir = tempfile.mkdtemp()

        if FLAGS.sync_replicas:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                recovery_wait_secs=1,
                global_step=global_step)

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index])

        # The chief worker (task_index==0) session will prepare the session,
        # while the remaining workers will wait for the preparation to complete.
        if is_chief:
            print("Worker %d: Initializing session..." % FLAGS.task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." % FLAGS.task_index)

        if FLAGS.existing_servers:
            server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index]
            print("Using existing server at: %s" % server_grpc_url)

            sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

        print("Worker %d: Session initialization complete." % FLAGS.task_index)

        if FLAGS.sync_replicas and is_chief:
            # Chief worker will start the chief queue runner and call the init op.
            sess.run(sync_init_op)
            sv.start_queue_runners(sess, [chief_queue_runner])

        # Perform training
        time_begin = time.time()
        print("Training begins @ %f" % time_begin)

        local_step = 0
        while True:
            # Training feed
            batch_xs, batch_ys = mnist.train.next_batch(PARAMS['batch_size'])
            train_feed = {x: batch_xs, y_: batch_ys}

            _, step = sess.run([train_step, global_step], feed_dict=train_feed)
            local_step += 1

            now = time.time()
            print("%f: Worker %d: training step %d done (global step: %d)" %
                  (now, FLAGS.task_index, local_step, step))

            if step > 0 and step % 5000 == 0 and is_chief:
                val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
                interim_val_xent = sess.run(cross_entropy, feed_dict=val_feed)
                print("After %d training step(s), validation cross entropy = %g" %
                      (step, interim_val_xent))
                # Only chief worker can report intermediate metrics
                nni.report_intermediate_result(interim_val_xent)

            if step >= FLAGS.train_steps:
                break

        time_end = time.time()
        print("Training ends @ %f" % time_end)
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)

        # Validation feed
        val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
        val_xent = sess.run(cross_entropy, feed_dict=val_feed)
        print("After %d training step(s), validation cross entropy = %g" %
              (FLAGS.train_steps, val_xent))
        # Only chief worker can report final metrics
        if is_chief:
            nni.report_final_result(val_xent)

if __name__ == "__main__":
    tf.app.run()
```
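For orientation, here is a minimal sketch of the NNI trial protocol that the file above follows. Only the three `nni` calls are the actual SDK API; everything else (the parameter names, the fake metric) is illustrative:

```python
import nni

def generate_default_params():
    # Defaults, overridden by whatever the tuner sends for this trial.
    return {'learning_rate': 0.01, 'batch_size': 100}

if __name__ == '__main__':
    params = generate_default_params()
    # Ask the NNI tuner for this trial's hyper-parameters and merge them in.
    params.update(nni.get_next_parameter())
    metric = 1.0
    for step in range(10):
        metric = metric * 0.9                   # stand-in for validation cross entropy
        nni.report_intermediate_result(metric)  # periodic metric (chief worker only, above)
    nni.report_final_result(metric)             # exactly one final metric per trial
```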
examples/trials/mnist-hyperband/mnist.py (+11 −2)

Adds the `import time` it now needs, a `download_mnist_retry` helper, and switches `main` to use it:

```diff
@@ -3,8 +3,9 @@
 import logging
 import math
 import tempfile
-import tensorflow as tf
+import time
+import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
 import nni
@@ -142,13 +143,21 @@ def bias_variable(shape):
     initial = tf.constant(0.1, shape=shape)
     return tf.Variable(initial)
 
+def download_mnist_retry(data_dir, max_num_retries=20):
+    """Try to download mnist dataset and avoid errors"""
+    for _ in range(max_num_retries):
+        try:
+            return input_data.read_data_sets(data_dir, one_hot=True)
+        except tf.errors.AlreadyExistsError:
+            time.sleep(1)
+    raise Exception("Failed to download MNIST.")
+
 def main(params):
     '''
     Main function, build mnist network, run and send result to NNI.
     '''
     # Import data
-    mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
+    mnist = download_mnist_retry(params['data_dir'])
     print('Mnist download data done.')
     logger.debug('Mnist download data done.')
```
examples/trials/mnist/mnist.py (+11 −2)

Same change as mnist-hyperband: `import time`, the retry helper, and `main` switched over to it:

```diff
@@ -4,8 +4,9 @@ import argparse
 import logging
 import math
 import tempfile
-import tensorflow as tf
+import time
+import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
 import nni
@@ -143,13 +144,21 @@ def bias_variable(shape):
     initial = tf.constant(0.1, shape=shape)
     return tf.Variable(initial)
 
+def download_mnist_retry(data_dir, max_num_retries=20):
+    """Try to download mnist dataset and avoid errors"""
+    for _ in range(max_num_retries):
+        try:
+            return input_data.read_data_sets(data_dir, one_hot=True)
+        except tf.errors.AlreadyExistsError:
+            time.sleep(1)
+    raise Exception("Failed to download MNIST.")
+
 def main(params):
     '''
     Main function, build mnist network, run and send result to NNI.
     '''
     # Import data
-    mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
+    mnist = download_mnist_retry(params['data_dir'])
     print('Mnist download data done.')
     logger.debug('Mnist download data done.')
```
examples/trials/mnist/mnist_before.py (+11 −2)

Same retry-download change again:

```diff
@@ -3,8 +3,9 @@ import argparse
 import logging
 import math
 import tempfile
-import tensorflow as tf
+import time
+import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
 
 FLAGS = None
@@ -143,13 +144,21 @@ def bias_variable(shape):
     initial = tf.constant(0.1, shape=shape)
     return tf.Variable(initial)
 
+def download_mnist_retry(data_dir, max_num_retries=20):
+    """Try to download mnist dataset and avoid errors"""
+    for _ in range(max_num_retries):
+        try:
+            return input_data.read_data_sets(data_dir, one_hot=True)
+        except tf.errors.AlreadyExistsError:
+            time.sleep(1)
+    raise Exception("Failed to download MNIST.")
+
 def main(params):
     '''
     Main function, build mnist network, run and send result to NNI.
     '''
     # Import data
-    mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
+    mnist = download_mnist_retry(params['data_dir'])
     print('Mnist download data done.')
     logger.debug('Mnist download data done.')
```
src/sdk/pynni/nni/curvefitting_assessor/curvefitting_assessor.py (+5 −4)

The single judgment counter becomes a per-trial dict: `last_judgment_num` records, for each `trial_job_id`, the step at which that trial was last judged, so each trial is re-assessed only after `gap` further intermediate results of its own:

```diff
@@ -57,8 +57,8 @@ class CurvefittingAssessor(Assessor):
         self.threshold = threshold
         # Record the number of gap
         self.gap = gap
-        # Record the number of times of judgments
-        self.judgment_num = 0
+        # Record the number of intermediate result in the lastest judgment
+        self.last_judgment_num = dict()
         # Record the best performance
         self.set_best_performance = False
         self.completed_best_performance = None
@@ -112,9 +112,10 @@ class CurvefittingAssessor(Assessor):
         curr_step = len(trial_history)
         if curr_step < self.start_step:
             return AssessResult.Good
-        if (curr_step - self.start_step) // self.gap <= self.judgment_num:
+        if trial_job_id in self.last_judgment_num.keys() and curr_step - self.last_judgment_num[trial_job_id] < self.gap:
             return AssessResult.Good
-        self.judgment_num = (curr_step - self.start_step) // self.gap
+        self.last_judgment_num[trial_job_id] = curr_step
 
         try:
             start_time = datetime.datetime.now()
```
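A toy illustration (not part of the diff) of why the single counter becomes a dict: with several concurrent trials, each `trial_job_id` needs its own record of the step at which it was last judged:

```python
# Hypothetical standalone sketch of the per-trial gap bookkeeping.
last_judgment_num = {}   # trial_job_id -> step at the last judgment
gap = 5

def should_judge(trial_job_id, curr_step):
    # Skip judgment if this trial was judged fewer than `gap` steps ago.
    if trial_job_id in last_judgment_num and curr_step - last_judgment_num[trial_job_id] < gap:
        return False
    last_judgment_num[trial_job_id] = curr_step
    return True

assert should_judge('trial_A', 6)          # first judgment for trial_A
assert not should_judge('trial_A', 8)      # only 2 steps since trial_A was judged
assert should_judge('trial_B', 8)          # trial_B is tracked independently
assert should_judge('trial_A', 11)         # 5 steps have elapsed for trial_A
```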
src/sdk/pynni/nni/metis_tuner/Regression_GP/OutlierDetection.py (+12 −14)

Renames `gp_create_model.createModel` to `create_model` at both call sites and re-wraps some continuation lines; behavior is unchanged. The file still lacks a trailing newline.

```diff
@@ -37,13 +37,13 @@ def _outlierDetection_threaded(inputs):
     sys.stderr.write("[%s] DEBUG: Evaluating %dth of %d samples\n" \
                      % (os.path.basename(__file__), samples_idx + 1, len(samples_x)))
     outlier = None
 
     # Create a diagnostic regression model which removes the sample that we want to evaluate
-    diagnostic_regressor_gp = gp_create_model.createModel(\
+    diagnostic_regressor_gp = gp_create_model.create_model(\
                               samples_x[0:samples_idx] + samples_x[samples_idx + 1:],\
                               samples_y_aggregation[0:samples_idx] + samples_y_aggregation[samples_idx + 1:])
     mu, sigma = gp_prediction.predict(samples_x[samples_idx], diagnostic_regressor_gp['model'])
 
     # 2.33 is the z-score for 98% confidence level
     if abs(samples_y_aggregation[samples_idx] - mu) > (2.33 * sigma):
         outlier = {"samples_idx": samples_idx,
```

@@ -51,26 +51,26 — only continuation formatting changes in this hunk; for context:

```python
               "expected_sigma": sigma,
               "difference": abs(samples_y_aggregation[samples_idx] - mu) - (2.33 * sigma)}
    return outlier

def outlierDetection_threaded(samples_x, samples_y_aggregation):
    '''
    Use Multi-thread to detect the outlier
    '''
    outliers = []
    threads_inputs = [[samples_idx, samples_x, samples_y_aggregation]\
                      for samples_idx in range(0, len(samples_x))]
    threads_pool = ThreadPool(min(4, len(threads_inputs)))
    threads_results = threads_pool.map(_outlierDetection_threaded, threads_inputs)
    threads_pool.close()
    threads_pool.join()

    for threads_result in threads_results:
        if threads_result is not None:
            outliers.append(threads_result)
        else:
            print("error here.")

    outliers = None if len(outliers) == 0 else outliers
    return outliers
```

```diff
@@ -79,21 +79,19 @@ def outlierDetection(samples_x, samples_y_aggregation):
     '''
     outliers = []
     for samples_idx in range(0, len(samples_x)):
         #sys.stderr.write("[%s] DEBUG: Evaluating %d of %d samples\n"
         # \ % (os.path.basename(__file__), samples_idx + 1, len(samples_x)))
-        diagnostic_regressor_gp = gp_create_model.createModel(\
+        diagnostic_regressor_gp = gp_create_model.create_model(\
                                   samples_x[0:samples_idx] + samples_x[samples_idx + 1:],\
                                   samples_y_aggregation[0:samples_idx] + samples_y_aggregation[samples_idx + 1:])
         mu, sigma = gp_prediction.predict(samples_x[samples_idx],
                                           diagnostic_regressor_gp['model'])
         # 2.33 is the z-score for 98% confidence level
         if abs(samples_y_aggregation[samples_idx] - mu) > (2.33 * sigma):
             outliers.append({"samples_idx": samples_idx,
                              "expected_mu": mu,
                              "expected_sigma": sigma,
                              "difference": abs(samples_y_aggregation[samples_idx] - mu) - (2.33 * sigma)})
 
     outliers = None if len(outliers) == 0 else outliers
     return outliers
\ No newline at end of file
```
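For reference, a self-contained sketch (hypothetical names) of the leave-one-out test this module performs; in the real code `mu` and `sigma` come from a Gaussian-process model fitted on all the other samples. A point is flagged when its observation lies more than 2.33 predicted standard deviations from the predicted mean, i.e. outside a 98% confidence band:

```python
def is_outlier(observed_y, mu, sigma, z=2.33):
    """Flag a sample whose observation deviates from the model's
    prediction by more than z standard deviations (z=2.33 ~ 98%)."""
    return abs(observed_y - mu) > z * sigma

print(is_outlier(1.5, mu=1.0, sigma=0.1))  # True: 0.5 > 0.233
print(is_outlier(1.1, mu=1.0, sigma=0.1))  # False: 0.1 <= 0.233
```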
src/sdk/pynni/nni/metis_tuner/metis_tuner.py (+83 −53)

Summary of this file's changes: imports regrouped; `selection_num_starting_points` default raised from 10 to 600; a new `exploration_probability` parameter (default 0.1) drives a new STEP 7 in `_selection` that occasionally returns an exploration candidate instead of the current optimum; aggregated medians are now stored as single-element lists; several long call sites are re-wrapped.
@@ -24,22 +24,20 @@ import os — the imports are regrouped and sorted (stdlib, then third-party, then package modules); the new block reads:

```python
import random
import statistics
import sys
from enum import Enum, unique
from multiprocessing.dummy import Pool as ThreadPool

import numpy as np

import nni.metis_tuner.lib_constraint_summation as lib_constraint_summation
import nni.metis_tuner.lib_data as lib_data
import nni.metis_tuner.Regression_GMM.CreateModel as gmm_create_model
import nni.metis_tuner.Regression_GMM.Selection as gmm_selection
import nni.metis_tuner.Regression_GP.CreateModel as gp_create_model
import nni.metis_tuner.Regression_GP.OutlierDetection as gp_outlier_detection
import nni.metis_tuner.Regression_GP.Prediction as gp_prediction
import nni.metis_tuner.Regression_GP.Selection as gp_selection
from nni.tuner import Tuner

logger = logging.getLogger("Metis_Tuner_AutoML")
```

@@ -67,33 +65,37 @@ class MetisTuner(Tuner): — `selection_num_starting_points` now defaults to 600 instead of 10, and a new `exploration_probability` argument (default 0.1) is documented and stored, along with a new `history_parameters` list:

```diff
     """
 
-    def __init__(self, optimize_mode="maximize", no_resampling=True, no_candidates=True,
-                 selection_num_starting_points=10, cold_start_num=10):
+    def __init__(self, optimize_mode="maximize", no_resampling=True, no_candidates=True,
+                 selection_num_starting_points=600, cold_start_num=10,
+                 exploration_probability=0.1):
         """
         Parameters
         ----------
         optimize_mode : str
             optimize_mode is a string that including two mode "maximize" and "minimize"
 
         no_resampling : bool
             True or False. Should Metis consider re-sampling as part of the search strategy?
             If you are confident that the training dataset is noise-free, then you do not need re-sampling.
 
         no_candidates: bool
             True or False. Should Metis suggest parameters for the next benchmark?
             If you do not plan to do more benchmarks, Metis can skip this step.
 
         selection_num_starting_points: int
             how many times Metis should try to find the global optimal in the search space?
             The higher the number, the longer it takes to output the solution.
 
         cold_start_num: int
             Metis need some trial result to get cold start. when the number of trial result is less than
             cold_start_num, Metis will randomly sample hyper-parameter for trial.
 
+        exploration_probability: float
+            The probability of Metis to select parameter from exploration instead of exploitation.
         """
 
         self.samples_x = []
         self.samples_y = []
         self.samples_y_aggregation = []
+        self.history_parameters = []
         self.space = None
         self.no_resampling = no_resampling
         self.no_candidates = no_candidates
```
```diff
@@ -101,6 +103,7 @@ class MetisTuner(Tuner):
         self.key_order = []
         self.cold_start_num = cold_start_num
         self.selection_num_starting_points = selection_num_starting_points
+        self.exploration_probability = exploration_probability
         self.minimize_constraints_fun = None
         self.minimize_starting_points = None
```
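As a usage sketch, the new argument can be set when the tuner is constructed directly (assuming the module path matches this file's location; in practice the tuner is normally selected through the experiment YAML):

```python
from nni.metis_tuner.metis_tuner import MetisTuner

# exploration_probability=0.1: roughly one suggestion in ten is taken from
# the exploration candidates instead of the current predicted optimum.
tuner = MetisTuner(optimize_mode="maximize",
                   no_resampling=True,
                   no_candidates=True,
                   selection_num_starting_points=600,
                   cold_start_num=10,
                   exploration_probability=0.1)
```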
@@ -128,7 +131,7 — only the indentation of a wrapped error message changes:

```python
            except Exception as ex:
                logger.exception(ex)
                raise RuntimeError("The format search space contains \
                    some key that didn't define in key_order.")

        if key_type == 'quniform':
            if key_range[2] == 1:
```

@@ -191,7 +194,7 — whitespace-only change in the `generate_parameters` docstring (`parameter_id : int` / `Returns` / `result : dict`).

@@ -200,13 +203,15 — the `self._selection(...)` call is re-wrapped; content unchanged (including the "paramageters" typo in the log message, which is in the source):

```python
            init_parameter = _rand_init(self.x_bounds, self.x_types, 1)[0]
            results = self._pack_output(init_parameter)
        else:
            self.minimize_starting_points = _rand_init(self.x_bounds, self.x_types, \
                                                       self.selection_num_starting_points)
            results = self._selection(self.samples_x, self.samples_y_aggregation, self.samples_y,
                                      self.x_bounds, self.x_types,
                                      threshold_samplessize_resampling=(None if self.no_resampling is True else 50),
                                      no_candidates=self.no_candidates,
                                      minimize_starting_points=self.minimize_starting_points,
                                      minimize_constraints_fun=self.minimize_constraints_fun)

        logger.info("Generate paramageters:\n" + str(results))
        return results
```

@@ -245,7 +250,7 — the aggregated median is now stored as a single-element list:

```diff
                 # calculate y aggregation
                 median = get_median(temp_y)
-                self.samples_y_aggregation[idx] = median
+                self.samples_y_aggregation[idx] = [median]
             else:
                 self.samples_x.append(sample_x)
                 self.samples_y.append([value])
```

@@ -264,17 +269,21 — STEP 1 of `_selection`; the `gp_selection.selection("lm", ...)` call is re-wrapped to one argument per line:

```python
        candidates = []
        samples_size_all = sum([len(i) for i in samples_y])
        samples_size_unique = len(samples_y)

        # ===== STEP 1: Compute the current optimum =====
        gp_model = gp_create_model.create_model(samples_x, samples_y_aggregation)
        lm_current = gp_selection.selection(
            "lm",
            samples_y_aggregation,
            x_bounds,
            x_types,
            gp_model['model'],
            minimize_starting_points,
            minimize_constraints_fun=minimize_constraints_fun)
        if not lm_current:
            return None

        if no_candidates is False:
            candidates.append({'hyperparameter': lm_current['hyperparameter'],
                               'expected_mu': lm_current['expected_mu'],
```

@@ -284,10 +293,14 and @@ -308,12 +321,13 — the STEP 2 `gp_selection.selection("lc", ...)` call (candidates for exploration) and the STEP 3 `gmm_selection.selection(...)` call (candidates for exploitation) are re-wrapped the same way; behavior is unchanged.

@@ -326,9 +340,9 — the comment block explaining the GMM `ValueError` ("Fitting the mixture model failed because some components have ill-defined empirical covariance, for instance caused by singleton or collapsed samples; try to decrease the number of components, or increase reg_covar") is re-indented only.

@@ -340,8 +354,6 — a commented-out line is dropped from the resampling step:

```diff
             results_outliers = gp_outlier_detection.outlierDetection_threaded(samples_x, samples_y_aggregation)
             if results_outliers is not None:
-                #temp = len(candidates)
                 for results_outlier in results_outliers:
                     if _num_past_samples(samples_x[results_outlier['samples_idx']], samples_x, samples_y) < max_resampling_per_x:
                         candidates.append({'hyperparameter': samples_x[results_outlier['samples_idx']],\
```

@@ -357,7 +369,10 — the `threads_inputs` list comprehension is re-wrapped:

```python
            logger.info("Evaluating information gain of %d candidates...\n")
            next_improvement = 0

            threads_inputs = [[candidate, samples_x, samples_y, x_bounds, x_types,
                               minimize_constraints_fun, minimize_starting_points]
                              for candidate in candidates]
            threads_pool = ThreadPool(4)
            # Evaluate what would happen if we actually sample each candidate
            threads_results = threads_pool.map(_calculate_lowest_mu_threaded, threads_inputs)
```

@@ -368,21 +383,23 — as rendered in the diff view, the commented-out "next_candidate changed" debug logging is activated as a real `logger.info` call, and the "no candidates" message in the `else` branch is re-wrapped:

```python
                if threads_result['expected_lowest_mu'] < lm_current['expected_mu']:
                    # Information gain
                    temp_improvement = threads_result['expected_lowest_mu'] - lm_current['expected_mu']

                    if next_improvement > temp_improvement:
                        logger.info("DEBUG: \"next_candidate\" changed: \
                                    lowest mu might reduce from %f (%s) to %f (%s), %s\n" %\
                                    lm_current['expected_mu'], str(lm_current['hyperparameter']),\
                                    threads_result['expected_lowest_mu'],\
                                    str(threads_result['candidate']['hyperparameter']),\
                                    threads_result['candidate']['reason'])
                        next_improvement = temp_improvement
                        next_candidate = threads_result['candidate']
        else:
            # ===== STEP 6: If we have no candidates, randomly pick one =====
            logger.info("DEBUG: No candidates from exploration, exploitation,\
                         and resampling. We will random a candidate for next_candidate\n")
            next_candidate = _rand_with_constraints(x_bounds, x_types) \
                if minimize_starting_points is None else minimize_starting_points[0]
```

@@ -391,7 +408,16 — the new STEP 7 implements the exploration probability: if the current optimum was already suggested before, or a uniform draw falls below `exploration_probability`, the exploration candidate (or a random parameter) is returned instead:

```diff
             next_candidate = {'hyperparameter': next_candidate, 'reason': "random",
                               'expected_mu': expected_mu, 'expected_sigma': expected_sigma}
 
+        # ===== STEP 7: If current optimal hyperparameter occurs in the history or exploration probability is less than the threshold, take next config as exploration step =====
+        outputs = self._pack_output(lm_current['hyperparameter'])
+        ap = random.uniform(0, 1)
+        if outputs in self.history_parameters or ap <= self.exploration_probability:
+            if next_candidate is not None:
+                outputs = self._pack_output(next_candidate['hyperparameter'])
+            else:
+                random_parameter = _rand_init(self.x_bounds, self.x_types, 1)[0]
+                outputs = self._pack_output(random_parameter)
+        self.history_parameters.append(outputs)
+        return outputs
```

@@ -437,10 +463,14 — in `_calculate_lowest_mu_threaded`, the `gp_selection.selection("lm", ...)` call is re-wrapped the same way as in STEP 1:

```python
        # Aggregates multiple observation of the sample sampling points
        temp_y_aggregation = [statistics.median(temp_sample_y) for temp_sample_y in temp_samples_y]
        temp_gp = gp_create_model.create_model(temp_samples_x, temp_y_aggregation)
        temp_results = gp_selection.selection(
            "lm",
            temp_y_aggregation,
            x_bounds,
            x_types,
            temp_gp['model'],
            minimize_starting_points,
            minimize_constraints_fun=minimize_constraints_fun)

        if outputs["expected_lowest_mu"] is None or outputs["expected_lowest_mu"] > temp_results['expected_mu']:
            outputs["expected_lowest_mu"] = temp_results['expected_mu']
```
src/webui/src/components/TrialsDetail.tsx (+1 −1)

Fixes a typo in the accuracy tooltip: "Default Metrc" becomes "Default Metric".

```diff
@@ -92,7 +92,7 @@ class TrialsDetail extends React.Component<{}, TrialDetailState> {
             formatter: function (data: TooltipForAccuracy) {
                 const result = '<div class="tooldetailAccuracy">' +
                     '<div>Trial No: ' + data.data[0] + '</div>' +
-                    '<div>Default Metrc: ' + data.data[1] + '</div>' +
+                    '<div>Default Metric: ' + data.data[1] + '</div>' +
                     '<div>Parameters: ' +
                     '<pre>' + JSON.stringify(data.data[2], null, 4) + '</pre>' +
                     '</div>' +
```
test/config_test/examples/mnist-annotation.test.yml (+2 −2)

```diff
 authorName: nni
 experimentName: default_test
 maxExecDuration: 5m
-maxTrialNum: 2
-trialConcurrency: 1
+maxTrialNum: 4
+trialConcurrency: 2
 tuner:
   builtinTunerName: Random
```
test/config_test/examples/mnist.test.yml (+2 −2)

```diff
 authorName: nni
 experimentName: default_test
 maxExecDuration: 5m
-maxTrialNum: 2
-trialConcurrency: 1
+maxTrialNum: 4
+trialConcurrency: 2
 searchSpacePath: ./mnist_search_space.json
 tuner:
```
test/config_test/multi_phase/multi_phase.test.yml (+2 −2)

```diff
 authorName: nni
 experimentName: default_test
 maxExecDuration: 5m
-maxTrialNum: 16
-trialConcurrency: 8
+maxTrialNum: 8
+trialConcurrency: 4
 searchSpacePath: ./search_space.json
 tuner:
```
test/pipelines-it-kubeflow.yml (+32 −19)

The standalone "build and upload nni docker image" step is removed; building and pushing the test image now happens inside the later conditional script, tagged with a timestamp. The `sed` substitution rewrites the Dockerfile line `RUN python3 -m pip --no-cache-dir install nni` into `COPY ./dist/* .` followed by `RUN python3 -m pip install nni-*.whl`, so the image installs the wheel built in the previous step.

```yaml
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

jobs:
- job: 'integration_test_kubeflow'
  timeoutInMinutes: 0
  pool: 'NNI CI KUBE CLI'
  variables:
    new_docker_img: msranni/nni.it.kb:latest
  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
```

```diff
@@ -18,20 +34,6 @@ jobs:
     condition: eq( variables['build_docker_img'], 'true' )
     displayName: 'build nni bdsit_wheel'
-  - script: |
-      cd deployment/pypi
-      docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
-      echo 'updating docker file for installing nni from local...'
-      # update Dockerfile to install NNI in docker image from whl file built in last step
-      sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
-      cat ../docker/Dockerfile
-      echo $IMG_TAG
-      docker build -f ../docker/Dockerfile -t $(new_docker_img) .
-      docker push $(new_docker_img)
-    condition: eq( variables['build_docker_img'], 'true' )
-    displayName: 'build and upload nni docker image'
   - script: |
       source install.sh
     displayName: 'Install nni toolkit via source code'
@@ -39,7 +41,18 @@ jobs:
   - script: |
       if [ $(build_docker_img) = 'true' ]
       then
-        export TEST_IMG=$(new_docker_img)
+        cd deployment/pypi
+        docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
+        echo 'updating docker file for installing nni from local...'
+        # update Dockerfile to install NNI in docker image from whl file built in last step
+        sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
+        cat ../docker/Dockerfile
+        export IMG_TAG=`date -u +%y%m%d%H%M`
+        docker build -f ../docker/Dockerfile -t $(test_docker_img_name):$IMG_TAG .
+        docker push $(test_docker_img_name):$IMG_TAG
+        export TEST_IMG=$(test_docker_img_name):$IMG_TAG
+        cd ../../
       else
         export TEST_IMG=$(existing_docker_img)
       fi
```
test/pipelines-it-pai.yml (+35 −19)

Same restructuring as pipelines-it-kubeflow.yml; the file begins with the same MIT license header as above.

```yaml
jobs:
- job: 'integration_test_pai'
  timeoutInMinutes: 0
  pool: 'NNI CI PAI CLI'
  variables:
    new_docker_img: msranni/nni.it.pai:latest
  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
```

```diff
@@ -18,20 +34,6 @@ jobs:
     condition: eq( variables['build_docker_img'], 'true' )
     displayName: 'build nni bdsit_wheel'
-  - script: |
-      cd deployment/pypi
-      docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
-      echo 'updating docker file for installing nni from local...'
-      # update Dockerfile to install NNI in docker image from whl file built in last step
-      sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
-      cat ../docker/Dockerfile
-      echo $IMG_TAG
-      docker build -f ../docker/Dockerfile -t $(new_docker_img) .
-      docker push $(new_docker_img)
-    condition: eq( variables['build_docker_img'], 'true' )
-    displayName: 'build and upload nni docker image'
   - script: |
       source install.sh
     displayName: 'Install nni toolkit via source code'
@@ -39,10 +41,24 @@ jobs:
   - script: |
       if [ $(build_docker_img) = 'true' ]
       then
-        export TEST_IMG=$(new_docker_img)
+        cd deployment/pypi
+        docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
+        echo 'updating docker file for installing nni from local...'
+        # update Dockerfile to install NNI in docker image from whl file built in last step
+        sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
+        cat ../docker/Dockerfile
+        export IMG_TAG=`date -u +%y%m%d%H%M`
+        echo 'build and upload docker image'
+        docker build -f ../docker/Dockerfile -t $(test_docker_img_name):$IMG_TAG .
+        docker push $(test_docker_img_name):$IMG_TAG
+        export TEST_IMG=$(test_docker_img_name):$IMG_TAG
+        cd ../../
       else
         export TEST_IMG=$(existing_docker_img)
       fi
+      echo "TEST_IMG:$TEST_IMG"
       cd test
       python3 generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --pai_pwd $(pai_pwd) \
```
tools/nni_annotation/README_zh_CN.md (+63 −37)

The Chinese annotation README is rewritten from a short numbered how-to into a structured guide to the four annotation types. The new document, translated from the Chinese:

# NNI Annotation

## Overview

To provide a good user experience and minimize the impact on existing code, NNI offers an annotation-based syntax. With annotations, you enable NNI by adding a few comment strings to your code, without affecting its original execution logic at all.

An example:

```python
'''@nni.variable(nni.choice(0.1, 0.01, 0.001), name=learning_rate)'''
learning_rate = 0.1
```

In this example, NNI chooses one value from (0.1, 0.01, 0.001) and assigns it to the `learning_rate` variable. The first line is the NNI annotation, an ordinary Python string. The line that follows must be an assignment statement; NNI assigns the variable on that line according to the information in the annotation.

In this way the code needs no modification: it can run directly as-is, and it can also be tuned with NNI.

## Types of annotations

NNI has 4 types of annotations:

### 1. Variables

`'''@nni.variable(sampling_algo, name)'''`

`@nni.variable` marks a variable.

**Arguments**

- **sampling_algo**: the sampling algorithm that specifies the search space. It can be any sampling function supported by NNI, prefixed with `nni.`, such as `choice` or `uniform`; see [SearchSpaceSpec](https://nni.readthedocs.io/zh/latest/SearchSpaceSpec.html) for details.
- **name**: the name of the variable to be assigned. Note that it must match the left-hand side of the assignment on the following line.

NNI supports the following 10 types to express a search space:

- `@nni.variable(nni.choice(option1,option2,...,optionN),name=variable)` The variable value is one of the options, each of which can be an arbitrary expression.
- `@nni.variable(nni.randint(upper),name=variable)` The variable can be any integer in the range [0, upper).
- `@nni.variable(nni.uniform(low, high),name=variable)` The variable value is uniformly distributed between low and high.
- `@nni.variable(nni.quniform(low, high, q),name=variable)` The variable value is round(uniform(low, high) / q) * q.
- `@nni.variable(nni.loguniform(low, high),name=variable)` The variable value is exp(uniform(low, high)), i.e. log-uniformly distributed.
- `@nni.variable(nni.qloguniform(low, high, q),name=variable)` The variable value is round(exp(uniform(low, high)) / q) * q.
- `@nni.variable(nni.normal(mu, sigma),name=variable)` The variable value is a real number, normally distributed with mean mu and standard deviation sigma.
- `@nni.variable(nni.qnormal(mu, sigma, q),name=variable)` The variable value is round(normal(mu, sigma) / q) * q.
- `@nni.variable(nni.lognormal(mu, sigma),name=variable)` The variable value is exp(normal(mu, sigma)).
- `@nni.variable(nni.qlognormal(mu, sigma, q),name=variable)` The variable value is round(exp(normal(mu, sigma)) / q) * q.

An example:

```python
'''@nni.variable(nni.choice(0.1, 0.01, 0.001), name=learning_rate)'''
learning_rate = 0.1
```

### 2. Functions

`'''@nni.function_choice(*functions, name)'''`

`@nni.function_choice` chooses one of several functions to execute.

**Arguments**

- **functions**: the candidate functions. Note that each must be a complete function call including its arguments, e.g. `max_pool(hidden_layer, pool_size)`.
- **name**: the name of the function that will be replaced.

For example:

```python
"""@nni.function_choice(max_pool(hidden_layer, pool_size), avg_pool(hidden_layer, pool_size), name=max_pool)"""
h_pooling = max_pool(hidden_layer, pool_size)
```

### 3. Intermediate results

`'''@nni.report_intermediate_result(metrics)'''`

`@nni.report_intermediate_result` reports intermediate results, with the same usage as `nni.report_intermediate_result` in [Trials.md](https://nni.readthedocs.io/zh/latest/Trials.html).

### 4. Final result

`'''@nni.report_final_result(metrics)'''`

`@nni.report_final_result` reports the final result of the current trial, with the same usage as `nni.report_final_result` in [Trials.md](https://nni.readthedocs.io/zh/latest/Trials.html).
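Putting the annotation types together, a trial script might look like the following sketch (`train_one_epoch` is a hypothetical stand-in for real training; the file stays runnable as plain Python because every annotation is an ordinary string):

```python
def train_one_epoch(lr, keep_prob):
    # Stub standing in for a real training epoch; returns a fake accuracy.
    return 0.9 - lr + 0.01 * keep_prob

'''@nni.variable(nni.choice(0.1, 0.01, 0.001), name=learning_rate)'''
learning_rate = 0.1

'''@nni.variable(nni.uniform(0.5, 0.9), name=keep_prob)'''
keep_prob = 0.5

for epoch in range(10):
    test_acc = train_one_epoch(learning_rate, keep_prob)
    '''@nni.report_intermediate_result(test_acc)'''

'''@nni.report_final_result(test_acc)'''
```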
tools/nni_annotation/examples/mnist_generated.py (+17 −2)

As rendered in the diff view, `import nni` moves to the very top of the file (where the annotation tool injects it):

```diff
+import nni
 """A deep MNIST classifier using convolutional layers."""
 import logging
 import math
 import tempfile
 import time
 import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
-import nni
 
 FLAGS = None
 logger = logging.getLogger('mnist_AutoML')
@@ -123,12 +127,23 @@ def bias_variable(shape):
     initial = tf.constant(0.1, shape=shape)
     return tf.Variable(initial)
 
+def download_mnist_retry(data_dir, max_num_retries=20):
+    """Try to download mnist dataset and avoid errors"""
+    for _ in range(max_num_retries):
+        try:
+            return input_data.read_data_sets(data_dir, one_hot=True)
+        except tf.errors.AlreadyExistsError:
+            time.sleep(1)
+    raise Exception("Failed to download MNIST.")
+
 def main(params):
-    """
-    Main function, build mnist network, run and send result to NNI.
-    """
-    mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
+    # Import data
+    mnist = download_mnist_retry(params['data_dir'])
+    print('Mnist download data done.')
+    logger.debug('Mnist download data done.')
     mnist_network = MnistNetwork(channel_1_num=params['channel_1_num'],
```
tools/nni_annotation/examples/mnist_with_annotation.py (+11 −2)

Same retry-download change as the mnist trial examples:

```diff
@@ -21,8 +21,9 @@
 import logging
 import math
 import tempfile
-import tensorflow as tf
+import time
+import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
 
 FLAGS = None
@@ -168,13 +169,21 @@ def bias_variable(shape):
     initial = tf.constant(0.1, shape=shape)
     return tf.Variable(initial)
 
+def download_mnist_retry(data_dir, max_num_retries=20):
+    """Try to download mnist dataset and avoid errors"""
+    for _ in range(max_num_retries):
+        try:
+            return input_data.read_data_sets(data_dir, one_hot=True)
+        except tf.errors.AlreadyExistsError:
+            time.sleep(1)
+    raise Exception("Failed to download MNIST.")
+
 def main(params):
     '''
     Main function, build mnist network, run and send result to NNI.
     '''
     # Import data
-    mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
+    mnist = download_mnist_retry(params['data_dir'])
     print('Mnist download data done.')
     logger.debug('Mnist download data done.')
```
tools/nni_annotation/examples/mnist_without_annotation.py (+11 −2)

Same retry-download change once more:

```diff
@@ -21,8 +21,9 @@
 import logging
 import math
 import tempfile
-import tensorflow as tf
+import time
+import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
 import nni
@@ -172,13 +173,21 @@ def bias_variable(shape):
     initial = tf.constant(0.1, shape=shape)
     return tf.Variable(initial)
 
+def download_mnist_retry(data_dir, max_num_retries=20):
+    """Try to download mnist dataset and avoid errors"""
+    for _ in range(max_num_retries):
+        try:
+            return input_data.read_data_sets(data_dir, one_hot=True)
+        except tf.errors.AlreadyExistsError:
+            time.sleep(1)
+    raise Exception("Failed to download MNIST.")
+
 def main(params):
     '''
     Main function, build mnist network, run and send result to NNI.
     '''
     # Import data
-    mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
+    mnist = download_mnist_retry(params['data_dir'])
     print('Mnist download data done.')
     logger.debug('Mnist download data done.')
```