Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
21165b53
Unverified
Commit
21165b53
authored
Mar 07, 2019
by
SparkSnail
Committed by
GitHub
Mar 07, 2019
Browse files
Merge pull request #138 from Microsoft/master
merge master
parents
41a9a598
f10c3311
Changes
37
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
573 additions
and
407 deletions
+573
-407
examples/trials/mnist-distributed/dist_mnist.py
examples/trials/mnist-distributed/dist_mnist.py
+264
-242
examples/trials/mnist-hyperband/mnist.py
examples/trials/mnist-hyperband/mnist.py
+11
-2
examples/trials/mnist/mnist.py
examples/trials/mnist/mnist.py
+11
-2
examples/trials/mnist/mnist_before.py
examples/trials/mnist/mnist_before.py
+11
-2
src/sdk/pynni/nni/curvefitting_assessor/curvefitting_assessor.py
.../pynni/nni/curvefitting_assessor/curvefitting_assessor.py
+5
-4
src/sdk/pynni/nni/metis_tuner/Regression_GP/OutlierDetection.py
...k/pynni/nni/metis_tuner/Regression_GP/OutlierDetection.py
+12
-14
src/sdk/pynni/nni/metis_tuner/metis_tuner.py
src/sdk/pynni/nni/metis_tuner/metis_tuner.py
+83
-53
src/webui/src/components/TrialsDetail.tsx
src/webui/src/components/TrialsDetail.tsx
+1
-1
test/config_test/examples/mnist-annotation.test.yml
test/config_test/examples/mnist-annotation.test.yml
+2
-2
test/config_test/examples/mnist.test.yml
test/config_test/examples/mnist.test.yml
+2
-2
test/config_test/multi_phase/multi_phase.test.yml
test/config_test/multi_phase/multi_phase.test.yml
+2
-2
test/pipelines-it-kubeflow.yml
test/pipelines-it-kubeflow.yml
+32
-19
test/pipelines-it-pai.yml
test/pipelines-it-pai.yml
+35
-19
tools/nni_annotation/README_zh_CN.md
tools/nni_annotation/README_zh_CN.md
+63
-37
tools/nni_annotation/examples/mnist_generated.py
tools/nni_annotation/examples/mnist_generated.py
+17
-2
tools/nni_annotation/examples/mnist_with_annotation.py
tools/nni_annotation/examples/mnist_with_annotation.py
+11
-2
tools/nni_annotation/examples/mnist_without_annotation.py
tools/nni_annotation/examples/mnist_without_annotation.py
+11
-2
No files found.
examples/trials/mnist-distributed/dist_mnist.py
View file @
21165b53
...
...
@@ -13,10 +13,10 @@
# limitations under the License.
# ==============================================================================
#
# NNI (https://github.com/Microsoft/nni) modified this code to show how to
#
# NNI (https://github.com/Microsoft/nni) modified this code to show how to
# integrate distributed tensorflow training with NNI SDK
#
#
"""Distributed MNIST training and validation, with model replicas.
A simple softmax model with one hidden layer is defined. The parameters
...
...
@@ -54,19 +54,22 @@ import nni
flags
=
tf
.
app
.
flags
flags
.
DEFINE_string
(
"data_dir"
,
"/tmp/mnist-data"
,
"Directory for storing mnist data"
)
flags
.
DEFINE_boolean
(
"download_only"
,
False
,
"Only perform downloading of data; Do not proceed to "
"session preparation, model definition or training"
)
flags
.
DEFINE_integer
(
"task_index"
,
None
,
"Worker task index, should be >= 0. task_index=0 is "
"the master worker task the performs the variable "
"initialization "
)
flags
.
DEFINE_integer
(
"num_gpus"
,
1
,
"Total number of gpus for each machine."
"If you don't use GPU, please set it to '0'"
)
flags
.
DEFINE_integer
(
"replicas_to_aggregate"
,
None
,
"Number of replicas to aggregate before parameter update"
"is applied (For sync_replicas mode only; default: "
"num_workers)"
)
flags
.
DEFINE_boolean
(
"download_only"
,
False
,
"Only perform downloading of data; Do not proceed to "
"session preparation, model definition or training"
)
flags
.
DEFINE_integer
(
"task_index"
,
None
,
"Worker task index, should be >= 0. task_index=0 is "
"the master worker task the performs the variable "
"initialization "
)
flags
.
DEFINE_integer
(
"num_gpus"
,
1
,
"Total number of gpus for each machine."
"If you don't use GPU, please set it to '0'"
)
flags
.
DEFINE_integer
(
"replicas_to_aggregate"
,
None
,
"Number of replicas to aggregate before parameter update"
"is applied (For sync_replicas mode only; default: "
"num_workers)"
)
flags
.
DEFINE_integer
(
"train_steps"
,
20000
,
"Number of (global) training steps to perform"
)
flags
.
DEFINE_boolean
(
...
...
@@ -96,237 +99,256 @@ IMAGE_PIXELS = 28
# {'cluster': cluster,
# 'task': {'type': 'worker', 'index': 1}})
def generate_default_params():
    '''
    Generate default hyper parameters.

    Returns
    -------
    dict
        A freshly built dict with the keys 'learning_rate', 'batch_size'
        and 'hidden_units'. A new dict is created on every call, so the
        caller may update it in place (e.g. with tuner-provided values)
        without affecting later calls.
    '''
    return {
        'learning_rate': 0.01,
        'batch_size': 100,
        'hidden_units': 100,
    }
def download_mnist_retry(data_dir, max_num_retries=20):
    """Try to download mnist dataset and avoid errors.

    Calls ``input_data.read_data_sets(data_dir, one_hot=True)`` and retries
    when TensorFlow raises ``AlreadyExistsError`` (presumably because another
    process is creating the same files under ``data_dir`` -- confirm).

    Parameters
    ----------
    data_dir : str
        Directory passed through to ``read_data_sets``.
    max_num_retries : int
        Maximum number of attempts before giving up.

    Returns
    -------
    The dataset object returned by ``read_data_sets``.

    Raises
    ------
    RuntimeError
        If every attempt failed. ``RuntimeError`` is a subclass of
        ``Exception``, so existing ``except Exception`` handlers still work;
        the last TensorFlow error is attached as the cause.
    """
    last_error = None
    for attempt in range(max_num_retries):
        try:
            return input_data.read_data_sets(data_dir, one_hot=True)
        except tf.errors.AlreadyExistsError as err:
            last_error = err
            # Wait before the next attempt; sleeping after the final
            # attempt would only delay the failure report.
            if attempt + 1 < max_num_retries:
                time.sleep(1)
    raise RuntimeError("Failed to download MNIST.") from last_error
def
main
(
unused_argv
):
# Receive NNI hyper parameter and update it onto default params
RECEIVED_PARAMS
=
nni
.
get_next_parameter
()
PARAMS
=
generate_default_params
()
PARAMS
.
update
(
RECEIVED_PARAMS
)
# Parse environment variable TF_CONFIG to get job_name and task_index
# If not explicitly specified in the constructor and the TF_CONFIG
# environment variable is present, load cluster_spec from TF_CONFIG.
tf_config
=
json
.
loads
(
os
.
environ
.
get
(
'TF_CONFIG'
)
or
'{}'
)
task_config
=
tf_config
.
get
(
'task'
,
{})
task_type
=
task_config
.
get
(
'type'
)
task_index
=
task_config
.
get
(
'index'
)
FLAGS
.
job_name
=
task_type
FLAGS
.
task_index
=
task_index
mnist
=
input_data
.
read_data_sets
(
FLAGS
.
data_dir
,
one_hot
=
True
)
if
FLAGS
.
download_only
:
sys
.
exit
(
0
)
if
FLAGS
.
job_name
is
None
or
FLAGS
.
job_name
==
""
:
raise
ValueError
(
"Must specify an explicit `job_name`"
)
if
FLAGS
.
task_index
is
None
or
FLAGS
.
task_index
==
""
:
raise
ValueError
(
"Must specify an explicit `task_index`"
)
print
(
"job name = %s"
%
FLAGS
.
job_name
)
print
(
"task index = %d"
%
FLAGS
.
task_index
)
cluster_config
=
tf_config
.
get
(
'cluster'
,
{})
ps_hosts
=
cluster_config
.
get
(
'ps'
)
worker_hosts
=
cluster_config
.
get
(
'worker'
)
ps_hosts_str
=
','
.
join
(
ps_hosts
)
worker_hosts_str
=
','
.
join
(
worker_hosts
)
FLAGS
.
ps_hosts
=
ps_hosts_str
FLAGS
.
worker_hosts
=
worker_hosts_str
# Construct the cluster and start the server
ps_spec
=
FLAGS
.
ps_hosts
.
split
(
","
)
worker_spec
=
FLAGS
.
worker_hosts
.
split
(
","
)
# Get the number of workers.
num_workers
=
len
(
worker_spec
)
cluster
=
tf
.
train
.
ClusterSpec
({
"ps"
:
ps_spec
,
"worker"
:
worker_spec
})
if
not
FLAGS
.
existing_servers
:
# Not using existing servers. Create an in-process server.
server
=
tf
.
train
.
Server
(
cluster
,
job_name
=
FLAGS
.
job_name
,
task_index
=
FLAGS
.
task_index
)
if
FLAGS
.
job_name
==
"ps"
:
server
.
join
()
is_chief
=
(
FLAGS
.
task_index
==
0
)
if
FLAGS
.
num_gpus
>
0
:
# Avoid gpu allocation conflict: now allocate task_num -> #gpu
# for each worker in the corresponding machine
gpu
=
(
FLAGS
.
task_index
%
FLAGS
.
num_gpus
)
worker_device
=
"/job:worker/task:%d/gpu:%d"
%
(
FLAGS
.
task_index
,
gpu
)
elif
FLAGS
.
num_gpus
==
0
:
# Just allocate the CPU to worker server
cpu
=
0
worker_device
=
"/job:worker/task:%d/cpu:%d"
%
(
FLAGS
.
task_index
,
cpu
)
# The device setter will automatically place Variables ops on separate
# parameter servers (ps). The non-Variable ops will be placed on the workers.
# The ps use CPU and workers use corresponding GPU
with
tf
.
device
(
tf
.
train
.
replica_device_setter
(
worker_device
=
worker_device
,
ps_device
=
"/job:ps/cpu:0"
,
cluster
=
cluster
)):
global_step
=
tf
.
Variable
(
0
,
name
=
"global_step"
,
trainable
=
False
)
# Variables of the hidden layer
hid_w
=
tf
.
Variable
(
tf
.
truncated_normal
(
[
IMAGE_PIXELS
*
IMAGE_PIXELS
,
PARAMS
[
'hidden_units'
]],
stddev
=
1.0
/
IMAGE_PIXELS
),
name
=
"hid_w"
)
hid_b
=
tf
.
Variable
(
tf
.
zeros
([
PARAMS
[
'hidden_units'
]]),
name
=
"hid_b"
)
# Variables of the softmax layer
sm_w
=
tf
.
Variable
(
tf
.
truncated_normal
(
[
PARAMS
[
'hidden_units'
],
10
],
stddev
=
1.0
/
math
.
sqrt
(
PARAMS
[
'hidden_units'
])),
name
=
"sm_w"
)
sm_b
=
tf
.
Variable
(
tf
.
zeros
([
10
]),
name
=
"sm_b"
)
# Ops: located on the worker specified with FLAGS.task_index
x
=
tf
.
placeholder
(
tf
.
float32
,
[
None
,
IMAGE_PIXELS
*
IMAGE_PIXELS
])
y_
=
tf
.
placeholder
(
tf
.
float32
,
[
None
,
10
])
hid_lin
=
tf
.
nn
.
xw_plus_b
(
x
,
hid_w
,
hid_b
)
hid
=
tf
.
nn
.
relu
(
hid_lin
)
y
=
tf
.
nn
.
softmax
(
tf
.
nn
.
xw_plus_b
(
hid
,
sm_w
,
sm_b
))
cross_entropy
=
-
tf
.
reduce_sum
(
y_
*
tf
.
log
(
tf
.
clip_by_value
(
y
,
1e-10
,
1.0
)))
opt
=
tf
.
train
.
AdamOptimizer
(
PARAMS
[
'learning_rate'
])
if
FLAGS
.
sync_replicas
:
if
FLAGS
.
replicas_to_aggregate
is
None
:
replicas_to_aggregate
=
num_workers
else
:
replicas_to_aggregate
=
FLAGS
.
replicas_to_aggregate
opt
=
tf
.
train
.
SyncReplicasOptimizer
(
opt
,
replicas_to_aggregate
=
replicas_to_aggregate
,
total_num_replicas
=
num_workers
,
name
=
"mnist_sync_replicas"
)
train_step
=
opt
.
minimize
(
cross_entropy
,
global_step
=
global_step
)
if
FLAGS
.
sync_replicas
:
local_init_op
=
opt
.
local_step_init_op
if
is_chief
:
local_init_op
=
opt
.
chief_init_op
ready_for_local_init_op
=
opt
.
ready_for_local_init_op
# Initial token and chief queue runners required by the sync_replicas mode
chief_queue_runner
=
opt
.
get_chief_queue_runner
()
sync_init_op
=
opt
.
get_init_tokens_op
()
init_op
=
tf
.
global_variables_initializer
()
train_dir
=
tempfile
.
mkdtemp
()
if
FLAGS
.
sync_replicas
:
sv
=
tf
.
train
.
Supervisor
(
is_chief
=
is_chief
,
logdir
=
train_dir
,
init_op
=
init_op
,
local_init_op
=
local_init_op
,
ready_for_local_init_op
=
ready_for_local_init_op
,
recovery_wait_secs
=
1
,
global_step
=
global_step
)
else
:
sv
=
tf
.
train
.
Supervisor
(
is_chief
=
is_chief
,
logdir
=
train_dir
,
init_op
=
init_op
,
recovery_wait_secs
=
1
,
global_step
=
global_step
)
sess_config
=
tf
.
ConfigProto
(
allow_soft_placement
=
True
,
log_device_placement
=
False
,
device_filters
=
[
"/job:ps"
,
"/job:worker/task:%d"
%
FLAGS
.
task_index
])
# The chief worker (task_index==0) session will prepare the session,
# while the remaining workers will wait for the preparation to complete.
if
is_chief
:
print
(
"Worker %d: Initializing session..."
%
FLAGS
.
task_index
)
else
:
print
(
"Worker %d: Waiting for session to be initialized..."
%
FLAGS
.
task_index
)
if
FLAGS
.
existing_servers
:
server_grpc_url
=
"grpc://"
+
worker_spec
[
FLAGS
.
task_index
]
print
(
"Using existing server at: %s"
%
server_grpc_url
)
sess
=
sv
.
prepare_or_wait_for_session
(
server_grpc_url
,
config
=
sess_config
)
else
:
sess
=
sv
.
prepare_or_wait_for_session
(
server
.
target
,
config
=
sess_config
)
print
(
"Worker %d: Session initialization complete."
%
FLAGS
.
task_index
)
if
FLAGS
.
sync_replicas
and
is_chief
:
# Chief worker will start the chief queue runner and call the init op.
sess
.
run
(
sync_init_op
)
sv
.
start_queue_runners
(
sess
,
[
chief_queue_runner
])
# Perform training
time_begin
=
time
.
time
()
print
(
"Training begins @ %f"
%
time_begin
)
local_step
=
0
while
True
:
# Training feed
batch_xs
,
batch_ys
=
mnist
.
train
.
next_batch
(
PARAMS
[
'batch_size'
])
train_feed
=
{
x
:
batch_xs
,
y_
:
batch_ys
}
_
,
step
=
sess
.
run
([
train_step
,
global_step
],
feed_dict
=
train_feed
)
local_step
+=
1
now
=
time
.
time
()
print
(
"%f: Worker %d: training step %d done (global step: %d)"
%
(
now
,
FLAGS
.
task_index
,
local_step
,
step
))
if
step
>
0
and
step
%
5000
==
0
and
is_chief
:
# Receive NNI hyper parameter and update it onto default params
RECEIVED_PARAMS
=
nni
.
get_next_parameter
()
PARAMS
=
generate_default_params
()
PARAMS
.
update
(
RECEIVED_PARAMS
)
# Parse environment variable TF_CONFIG to get job_name and task_index
# If not explicitly specified in the constructor and the TF_CONFIG
# environment variable is present, load cluster_spec from TF_CONFIG.
tf_config
=
json
.
loads
(
os
.
environ
.
get
(
'TF_CONFIG'
)
or
'{}'
)
task_config
=
tf_config
.
get
(
'task'
,
{})
task_type
=
task_config
.
get
(
'type'
)
task_index
=
task_config
.
get
(
'index'
)
FLAGS
.
job_name
=
task_type
FLAGS
.
task_index
=
task_index
mnist
=
download_mnist_retry
(
FLAGS
.
data_dir
)
if
FLAGS
.
download_only
:
sys
.
exit
(
0
)
if
FLAGS
.
job_name
is
None
or
FLAGS
.
job_name
==
""
:
raise
ValueError
(
"Must specify an explicit `job_name`"
)
if
FLAGS
.
task_index
is
None
or
FLAGS
.
task_index
==
""
:
raise
ValueError
(
"Must specify an explicit `task_index`"
)
print
(
"job name = %s"
%
FLAGS
.
job_name
)
print
(
"task index = %d"
%
FLAGS
.
task_index
)
cluster_config
=
tf_config
.
get
(
'cluster'
,
{})
ps_hosts
=
cluster_config
.
get
(
'ps'
)
worker_hosts
=
cluster_config
.
get
(
'worker'
)
ps_hosts_str
=
','
.
join
(
ps_hosts
)
worker_hosts_str
=
','
.
join
(
worker_hosts
)
FLAGS
.
ps_hosts
=
ps_hosts_str
FLAGS
.
worker_hosts
=
worker_hosts_str
# Construct the cluster and start the server
ps_spec
=
FLAGS
.
ps_hosts
.
split
(
","
)
worker_spec
=
FLAGS
.
worker_hosts
.
split
(
","
)
# Get the number of workers.
num_workers
=
len
(
worker_spec
)
cluster
=
tf
.
train
.
ClusterSpec
({
"ps"
:
ps_spec
,
"worker"
:
worker_spec
})
if
not
FLAGS
.
existing_servers
:
# Not using existing servers. Create an in-process server.
server
=
tf
.
train
.
Server
(
cluster
,
job_name
=
FLAGS
.
job_name
,
task_index
=
FLAGS
.
task_index
)
if
FLAGS
.
job_name
==
"ps"
:
server
.
join
()
is_chief
=
(
FLAGS
.
task_index
==
0
)
if
FLAGS
.
num_gpus
>
0
:
# Avoid gpu allocation conflict: now allocate task_num -> #gpu
# for each worker in the corresponding machine
gpu
=
(
FLAGS
.
task_index
%
FLAGS
.
num_gpus
)
worker_device
=
"/job:worker/task:%d/gpu:%d"
%
(
FLAGS
.
task_index
,
gpu
)
elif
FLAGS
.
num_gpus
==
0
:
# Just allocate the CPU to worker server
cpu
=
0
worker_device
=
"/job:worker/task:%d/cpu:%d"
%
(
FLAGS
.
task_index
,
cpu
)
# The device setter will automatically place Variables ops on separate
# parameter servers (ps). The non-Variable ops will be placed on the workers.
# The ps use CPU and workers use corresponding GPU
with
tf
.
device
(
tf
.
train
.
replica_device_setter
(
worker_device
=
worker_device
,
ps_device
=
"/job:ps/cpu:0"
,
cluster
=
cluster
)):
global_step
=
tf
.
Variable
(
0
,
name
=
"global_step"
,
trainable
=
False
)
# Variables of the hidden layer
hid_w
=
tf
.
Variable
(
tf
.
truncated_normal
(
[
IMAGE_PIXELS
*
IMAGE_PIXELS
,
PARAMS
[
'hidden_units'
]],
stddev
=
1.0
/
IMAGE_PIXELS
),
name
=
"hid_w"
)
hid_b
=
tf
.
Variable
(
tf
.
zeros
([
PARAMS
[
'hidden_units'
]]),
name
=
"hid_b"
)
# Variables of the softmax layer
sm_w
=
tf
.
Variable
(
tf
.
truncated_normal
(
[
PARAMS
[
'hidden_units'
],
10
],
stddev
=
1.0
/
math
.
sqrt
(
PARAMS
[
'hidden_units'
])),
name
=
"sm_w"
)
sm_b
=
tf
.
Variable
(
tf
.
zeros
([
10
]),
name
=
"sm_b"
)
# Ops: located on the worker specified with FLAGS.task_index
x
=
tf
.
placeholder
(
tf
.
float32
,
[
None
,
IMAGE_PIXELS
*
IMAGE_PIXELS
])
y_
=
tf
.
placeholder
(
tf
.
float32
,
[
None
,
10
])
hid_lin
=
tf
.
nn
.
xw_plus_b
(
x
,
hid_w
,
hid_b
)
hid
=
tf
.
nn
.
relu
(
hid_lin
)
y
=
tf
.
nn
.
softmax
(
tf
.
nn
.
xw_plus_b
(
hid
,
sm_w
,
sm_b
))
cross_entropy
=
-
tf
.
reduce_sum
(
y_
*
tf
.
log
(
tf
.
clip_by_value
(
y
,
1e-10
,
1.0
)))
opt
=
tf
.
train
.
AdamOptimizer
(
PARAMS
[
'learning_rate'
])
if
FLAGS
.
sync_replicas
:
if
FLAGS
.
replicas_to_aggregate
is
None
:
replicas_to_aggregate
=
num_workers
else
:
replicas_to_aggregate
=
FLAGS
.
replicas_to_aggregate
opt
=
tf
.
train
.
SyncReplicasOptimizer
(
opt
,
replicas_to_aggregate
=
replicas_to_aggregate
,
total_num_replicas
=
num_workers
,
name
=
"mnist_sync_replicas"
)
train_step
=
opt
.
minimize
(
cross_entropy
,
global_step
=
global_step
)
if
FLAGS
.
sync_replicas
:
local_init_op
=
opt
.
local_step_init_op
if
is_chief
:
local_init_op
=
opt
.
chief_init_op
ready_for_local_init_op
=
opt
.
ready_for_local_init_op
# Initial token and chief queue runners required by the sync_replicas mode
chief_queue_runner
=
opt
.
get_chief_queue_runner
()
sync_init_op
=
opt
.
get_init_tokens_op
()
init_op
=
tf
.
global_variables_initializer
()
train_dir
=
tempfile
.
mkdtemp
()
if
FLAGS
.
sync_replicas
:
sv
=
tf
.
train
.
Supervisor
(
is_chief
=
is_chief
,
logdir
=
train_dir
,
init_op
=
init_op
,
local_init_op
=
local_init_op
,
ready_for_local_init_op
=
ready_for_local_init_op
,
recovery_wait_secs
=
1
,
global_step
=
global_step
)
else
:
sv
=
tf
.
train
.
Supervisor
(
is_chief
=
is_chief
,
logdir
=
train_dir
,
init_op
=
init_op
,
recovery_wait_secs
=
1
,
global_step
=
global_step
)
sess_config
=
tf
.
ConfigProto
(
allow_soft_placement
=
True
,
log_device_placement
=
False
,
device_filters
=
[
"/job:ps"
,
"/job:worker/task:%d"
%
FLAGS
.
task_index
])
# The chief worker (task_index==0) session will prepare the session,
# while the remaining workers will wait for the preparation to complete.
if
is_chief
:
print
(
"Worker %d: Initializing session..."
%
FLAGS
.
task_index
)
else
:
print
(
"Worker %d: Waiting for session to be initialized..."
%
FLAGS
.
task_index
)
if
FLAGS
.
existing_servers
:
server_grpc_url
=
"grpc://"
+
worker_spec
[
FLAGS
.
task_index
]
print
(
"Using existing server at: %s"
%
server_grpc_url
)
sess
=
sv
.
prepare_or_wait_for_session
(
server_grpc_url
,
config
=
sess_config
)
else
:
sess
=
sv
.
prepare_or_wait_for_session
(
server
.
target
,
config
=
sess_config
)
print
(
"Worker %d: Session initialization complete."
%
FLAGS
.
task_index
)
if
FLAGS
.
sync_replicas
and
is_chief
:
# Chief worker will start the chief queue runner and call the init op.
sess
.
run
(
sync_init_op
)
sv
.
start_queue_runners
(
sess
,
[
chief_queue_runner
])
# Perform training
time_begin
=
time
.
time
()
print
(
"Training begins @ %f"
%
time_begin
)
local_step
=
0
while
True
:
# Training feed
batch_xs
,
batch_ys
=
mnist
.
train
.
next_batch
(
PARAMS
[
'batch_size'
])
train_feed
=
{
x
:
batch_xs
,
y_
:
batch_ys
}
_
,
step
=
sess
.
run
([
train_step
,
global_step
],
feed_dict
=
train_feed
)
local_step
+=
1
now
=
time
.
time
()
print
(
"%f: Worker %d: training step %d done (global step: %d)"
%
(
now
,
FLAGS
.
task_index
,
local_step
,
step
))
if
step
>
0
and
step
%
5000
==
0
and
is_chief
:
val_feed
=
{
x
:
mnist
.
validation
.
images
,
y_
:
mnist
.
validation
.
labels
}
interim_val_xent
=
sess
.
run
(
cross_entropy
,
feed_dict
=
val_feed
)
print
(
"After %d training step(s), validation cross entropy = %g"
%
(
step
,
interim_val_xent
))
# Only chief worker can report intermediate metrics
nni
.
report_intermediate_result
(
interim_val_xent
)
if
step
>=
FLAGS
.
train_steps
:
break
time_end
=
time
.
time
()
print
(
"Training ends @ %f"
%
time_end
)
training_time
=
time_end
-
time_begin
print
(
"Training elapsed time: %f s"
%
training_time
)
# Validation feed
val_feed
=
{
x
:
mnist
.
validation
.
images
,
y_
:
mnist
.
validation
.
labels
}
interim_val_xent
=
sess
.
run
(
cross_entropy
,
feed_dict
=
val_feed
)
print
(
"After %d training step(s), validation cross entropy = %g"
%
(
step
,
interim_val_xent
))
# Only chief worker can report intermediate metrics
nni
.
report_intermediate_result
(
interim_val_xent
)
if
step
>=
FLAGS
.
train_steps
:
break
time_end
=
time
.
time
()
print
(
"Training ends @ %f"
%
time_end
)
training_time
=
time_end
-
time_begin
print
(
"Training elapsed time: %f s"
%
training_time
)
# Validation feed
val_feed
=
{
x
:
mnist
.
validation
.
images
,
y_
:
mnist
.
validation
.
labels
}
val_xent
=
sess
.
run
(
cross_entropy
,
feed_dict
=
val_feed
)
print
(
"After %d training step(s), validation cross entropy = %g"
%
(
FLAGS
.
train_steps
,
val_xent
))
val_xent
=
sess
.
run
(
cross_entropy
,
feed_dict
=
val_feed
)
print
(
"After %d training step(s), validation cross entropy = %g"
%
(
FLAGS
.
train_steps
,
val_xent
))
# Only chief worker can report final metrics
if
is_chief
:
nni
.
report_final_result
(
val_xent
)
# Only chief worker can report final metrics
if
is_chief
:
nni
.
report_final_result
(
val_xent
)
if
__name__
==
"__main__"
:
tf
.
app
.
run
()
tf
.
app
.
run
()
examples/trials/mnist-hyperband/mnist.py
View file @
21165b53
...
...
@@ -3,8 +3,9 @@
import
logging
import
math
import
tempfile
import
t
ensorflow
as
tf
import
t
ime
import
tensorflow
as
tf
from
tensorflow.examples.tutorials.mnist
import
input_data
import
nni
...
...
@@ -142,13 +143,21 @@ def bias_variable(shape):
initial
=
tf
.
constant
(
0.1
,
shape
=
shape
)
return
tf
.
Variable
(
initial
)
def download_mnist_retry(data_dir, max_num_retries=20):
    """Try to download mnist dataset and avoid errors.

    Calls ``input_data.read_data_sets(data_dir, one_hot=True)`` and retries
    when TensorFlow raises ``AlreadyExistsError`` (presumably because another
    process is creating the same files under ``data_dir`` -- confirm).

    Parameters
    ----------
    data_dir : str
        Directory passed through to ``read_data_sets``.
    max_num_retries : int
        Maximum number of attempts before giving up.

    Returns
    -------
    The dataset object returned by ``read_data_sets``.

    Raises
    ------
    RuntimeError
        If every attempt failed. ``RuntimeError`` is a subclass of
        ``Exception``, so existing ``except Exception`` handlers still work;
        the last TensorFlow error is attached as the cause.
    """
    last_error = None
    for attempt in range(max_num_retries):
        try:
            return input_data.read_data_sets(data_dir, one_hot=True)
        except tf.errors.AlreadyExistsError as err:
            last_error = err
            # Wait before the next attempt; sleeping after the final
            # attempt would only delay the failure report.
            if attempt + 1 < max_num_retries:
                time.sleep(1)
    raise RuntimeError("Failed to download MNIST.") from last_error
def
main
(
params
):
'''
Main function, build mnist network, run and send result to NNI.
'''
# Import data
mnist
=
input_data
.
read_data_sets
(
params
[
'data_dir'
]
,
one_hot
=
True
)
mnist
=
download_mnist_retry
(
params
[
'data_dir'
])
print
(
'Mnist download data done.'
)
logger
.
debug
(
'Mnist download data done.'
)
...
...
examples/trials/mnist/mnist.py
View file @
21165b53
...
...
@@ -4,8 +4,9 @@ import argparse
import
logging
import
math
import
tempfile
import
t
ensorflow
as
tf
import
t
ime
import
tensorflow
as
tf
from
tensorflow.examples.tutorials.mnist
import
input_data
import
nni
...
...
@@ -143,13 +144,21 @@ def bias_variable(shape):
initial
=
tf
.
constant
(
0.1
,
shape
=
shape
)
return
tf
.
Variable
(
initial
)
def download_mnist_retry(data_dir, max_num_retries=20):
    """Try to download mnist dataset and avoid errors.

    Calls ``input_data.read_data_sets(data_dir, one_hot=True)`` and retries
    when TensorFlow raises ``AlreadyExistsError`` (presumably because another
    process is creating the same files under ``data_dir`` -- confirm).

    Parameters
    ----------
    data_dir : str
        Directory passed through to ``read_data_sets``.
    max_num_retries : int
        Maximum number of attempts before giving up.

    Returns
    -------
    The dataset object returned by ``read_data_sets``.

    Raises
    ------
    RuntimeError
        If every attempt failed. ``RuntimeError`` is a subclass of
        ``Exception``, so existing ``except Exception`` handlers still work;
        the last TensorFlow error is attached as the cause.
    """
    last_error = None
    for attempt in range(max_num_retries):
        try:
            return input_data.read_data_sets(data_dir, one_hot=True)
        except tf.errors.AlreadyExistsError as err:
            last_error = err
            # Wait before the next attempt; sleeping after the final
            # attempt would only delay the failure report.
            if attempt + 1 < max_num_retries:
                time.sleep(1)
    raise RuntimeError("Failed to download MNIST.") from last_error
def
main
(
params
):
'''
Main function, build mnist network, run and send result to NNI.
'''
# Import data
mnist
=
input_data
.
read_data_sets
(
params
[
'data_dir'
]
,
one_hot
=
True
)
mnist
=
download_mnist_retry
(
params
[
'data_dir'
])
print
(
'Mnist download data done.'
)
logger
.
debug
(
'Mnist download data done.'
)
...
...
examples/trials/mnist/mnist_before.py
View file @
21165b53
...
...
@@ -3,8 +3,9 @@ import argparse
import
logging
import
math
import
tempfile
import
t
ensorflow
as
tf
import
t
ime
import
tensorflow
as
tf
from
tensorflow.examples.tutorials.mnist
import
input_data
FLAGS
=
None
...
...
@@ -143,13 +144,21 @@ def bias_variable(shape):
initial
=
tf
.
constant
(
0.1
,
shape
=
shape
)
return
tf
.
Variable
(
initial
)
def download_mnist_retry(data_dir, max_num_retries=20):
    """Try to download mnist dataset and avoid errors.

    Calls ``input_data.read_data_sets(data_dir, one_hot=True)`` and retries
    when TensorFlow raises ``AlreadyExistsError`` (presumably because another
    process is creating the same files under ``data_dir`` -- confirm).

    Parameters
    ----------
    data_dir : str
        Directory passed through to ``read_data_sets``.
    max_num_retries : int
        Maximum number of attempts before giving up.

    Returns
    -------
    The dataset object returned by ``read_data_sets``.

    Raises
    ------
    RuntimeError
        If every attempt failed. ``RuntimeError`` is a subclass of
        ``Exception``, so existing ``except Exception`` handlers still work;
        the last TensorFlow error is attached as the cause.
    """
    last_error = None
    for attempt in range(max_num_retries):
        try:
            return input_data.read_data_sets(data_dir, one_hot=True)
        except tf.errors.AlreadyExistsError as err:
            last_error = err
            # Wait before the next attempt; sleeping after the final
            # attempt would only delay the failure report.
            if attempt + 1 < max_num_retries:
                time.sleep(1)
    raise RuntimeError("Failed to download MNIST.") from last_error
def
main
(
params
):
'''
Main function, build mnist network, run and send result to NNI.
'''
# Import data
mnist
=
input_data
.
read_data_sets
(
params
[
'data_dir'
]
,
one_hot
=
True
)
mnist
=
download_mnist_retry
(
params
[
'data_dir'
])
print
(
'Mnist download data done.'
)
logger
.
debug
(
'Mnist download data done.'
)
...
...
src/sdk/pynni/nni/curvefitting_assessor/curvefitting_assessor.py
View file @
21165b53
...
...
@@ -57,8 +57,8 @@ class CurvefittingAssessor(Assessor):
self
.
threshold
=
threshold
# Record the number of gap
self
.
gap
=
gap
# Record the number of
times of
judgment
s
self
.
judgment_num
=
0
# Record the number of intermediate results at the latest judgment
self
.
last_
judgment_num
=
dict
()
# Record the best performance
self
.
set_best_performance
=
False
self
.
completed_best_performance
=
None
...
...
@@ -112,9 +112,10 @@ class CurvefittingAssessor(Assessor):
curr_step
=
len
(
trial_history
)
if
curr_step
<
self
.
start_step
:
return
AssessResult
.
Good
if
(
curr_step
-
self
.
start_step
)
//
self
.
gap
<=
self
.
judgment_num
:
if
trial_job_id
in
self
.
last_judgment_num
.
keys
()
and
curr_step
-
self
.
last_judgment_num
[
trial_job_id
]
<
self
.
gap
:
return
AssessResult
.
Good
self
.
judgment_num
=
(
curr_step
-
self
.
start_step
)
//
self
.
gap
self
.
last_
judgment_num
[
trial_job_id
]
=
curr_step
try
:
start_time
=
datetime
.
datetime
.
now
()
...
...
src/sdk/pynni/nni/metis_tuner/Regression_GP/OutlierDetection.py
View file @
21165b53
...
...
@@ -37,13 +37,13 @@ def _outlierDetection_threaded(inputs):
sys
.
stderr
.
write
(
"[%s] DEBUG: Evaluating %dth of %d samples
\n
"
\
%
(
os
.
path
.
basename
(
__file__
),
samples_idx
+
1
,
len
(
samples_x
)))
outlier
=
None
# Create a diagnostic regression model which removes the sample that we want to evaluate
diagnostic_regressor_gp
=
gp_create_model
.
create
M
odel
(
\
diagnostic_regressor_gp
=
gp_create_model
.
create
_m
odel
(
\
samples_x
[
0
:
samples_idx
]
+
samples_x
[
samples_idx
+
1
:],
\
samples_y_aggregation
[
0
:
samples_idx
]
+
samples_y_aggregation
[
samples_idx
+
1
:])
mu
,
sigma
=
gp_prediction
.
predict
(
samples_x
[
samples_idx
],
diagnostic_regressor_gp
[
'model'
])
# 2.33 is the z-score for 98% confidence level
if
abs
(
samples_y_aggregation
[
samples_idx
]
-
mu
)
>
(
2.33
*
sigma
):
outlier
=
{
"samples_idx"
:
samples_idx
,
...
...
@@ -51,26 +51,26 @@ def _outlierDetection_threaded(inputs):
"expected_sigma"
:
sigma
,
"difference"
:
abs
(
samples_y_aggregation
[
samples_idx
]
-
mu
)
-
(
2.33
*
sigma
)}
return
outlier
def outlierDetection_threaded(samples_x, samples_y_aggregation):
    '''
    Use multiple threads to detect outliers.

    Each sample index is evaluated by `_outlierDetection_threaded`, which
    receives `[samples_idx, samples_x, samples_y_aggregation]` and returns
    either a dict describing the outlier or None when the sample is not an
    outlier.

    Parameters
    ----------
    samples_x : list
        Sampled points, one entry per sample.
    samples_y_aggregation : list
        Aggregated results, indexed in step with `samples_x`.

    Returns
    -------
    list of dict or None
        The outlier descriptions in sample order, or None when no sample was
        flagged (including when there are no samples at all).
    '''
    # A pool cannot be created with zero workers, so answer the empty
    # case directly instead of letting ThreadPool(0) raise ValueError.
    if len(samples_x) == 0:
        return None

    threads_inputs = [[samples_idx, samples_x, samples_y_aggregation]
                      for samples_idx in range(len(samples_x))]
    threads_pool = ThreadPool(min(4, len(threads_inputs)))
    try:
        threads_results = threads_pool.map(_outlierDetection_threaded, threads_inputs)
    finally:
        # Always release the worker threads, even if a worker raised.
        threads_pool.close()
        threads_pool.join()

    # None is the normal "not an outlier" answer from the worker, so it is
    # filtered out silently rather than reported as an error.
    outliers = [result for result in threads_results if result is not None]
    return outliers if outliers else None
...
...
@@ -79,21 +79,19 @@ def outlierDetection(samples_x, samples_y_aggregation):
'''
outliers
=
[]
for
samples_idx
in
range
(
0
,
len
(
samples_x
)):
#sys.stderr.write("[%s] DEBUG: Evaluating %d of %d samples\n"
#sys.stderr.write("[%s] DEBUG: Evaluating %d of %d samples\n"
# \ % (os.path.basename(__file__), samples_idx + 1, len(samples_x)))
diagnostic_regressor_gp
=
gp_create_model
.
create
M
odel
(
\
diagnostic_regressor_gp
=
gp_create_model
.
create
_m
odel
(
\
samples_x
[
0
:
samples_idx
]
+
samples_x
[
samples_idx
+
1
:],
\
samples_y_aggregation
[
0
:
samples_idx
]
+
samples_y_aggregation
[
samples_idx
+
1
:])
mu
,
sigma
=
gp_prediction
.
predict
(
samples_x
[
samples_idx
],
diagnostic_regressor_gp
[
'model'
])
diagnostic_regressor_gp
[
'model'
])
# 2.33 is the z-score for 98% confidence level
if
abs
(
samples_y_aggregation
[
samples_idx
]
-
mu
)
>
(
2.33
*
sigma
):
outliers
.
append
({
"samples_idx"
:
samples_idx
,
"expected_mu"
:
mu
,
"expected_sigma"
:
sigma
,
"difference"
:
abs
(
samples_y_aggregation
[
samples_idx
]
-
mu
)
-
(
2.33
*
sigma
)})
outliers
=
None
if
len
(
outliers
)
==
0
else
outliers
return
outliers
\ No newline at end of file
src/sdk/pynni/nni/metis_tuner/metis_tuner.py
View file @
21165b53
...
...
@@ -24,22 +24,20 @@ import os
import
random
import
statistics
import
sys
import
numpy
as
np
from
enum
import
Enum
,
unique
from
multiprocessing.dummy
import
Pool
as
ThreadPool
from
nni.tuner
import
Tuner
import
numpy
as
np
import
nni.metis_tuner.lib_data
as
lib_data
import
nni.metis_tuner.lib_constraint_summation
as
lib_constraint_summation
import
nni.metis_tuner.Regression_GP.CreateModel
as
gp_create_model
import
nni.metis_tuner.Regression_GP.Selection
as
gp_selection
import
nni.metis_tuner.Regression_GP.Prediction
as
gp_prediction
import
nni.metis_tuner.Regression_GP.OutlierDetection
as
gp_outlier_detection
import
nni.metis_tuner.lib_data
as
lib_data
import
nni.metis_tuner.Regression_GMM.CreateModel
as
gmm_create_model
import
nni.metis_tuner.Regression_GMM.Selection
as
gmm_selection
import
nni.metis_tuner.Regression_GP.CreateModel
as
gp_create_model
import
nni.metis_tuner.Regression_GP.OutlierDetection
as
gp_outlier_detection
import
nni.metis_tuner.Regression_GP.Prediction
as
gp_prediction
import
nni.metis_tuner.Regression_GP.Selection
as
gp_selection
from
nni.tuner
import
Tuner
logger
=
logging
.
getLogger
(
"Metis_Tuner_AutoML"
)
...
...
@@ -67,33 +65,37 @@ class MetisTuner(Tuner):
"""
def
__init__
(
self
,
optimize_mode
=
"maximize"
,
no_resampling
=
True
,
no_candidates
=
True
,
selection_num_starting_points
=
1
0
,
cold_start_num
=
10
):
selection_num_starting_points
=
60
0
,
cold_start_num
=
10
,
exploration_probability
=
0.1
):
"""
Parameters
----------
optimize_mode : str
optimize_mode is a string that including two mode "maximize" and "minimize"
no_resampling : bool
True or False. Should Metis consider re-sampling as part of the search strategy?
If you are confident that the training dataset is noise-free, then you do not need re-sampling.
no_candidates: bool
True or False. Should Metis suggest parameters for the next benchmark?
If you do not plan to do more benchmarks, Metis can skip this step.
selection_num_starting_points: int
how many times Metis should try to find the global optimal in the search space?
The higher the number, the longer it takes to output the solution.
cold_start_num: int
Metis need some trial result to get cold start. when the number of trial result is less than
cold_start_num, Metis will randomly sample hyper-parameter for trial.
exploration_probability: float
The probability of Metis to select parameter from exploration instead of exploitation.
"""
self
.
samples_x
=
[]
self
.
samples_y
=
[]
self
.
samples_y_aggregation
=
[]
self
.
history_parameters
=
[]
self
.
space
=
None
self
.
no_resampling
=
no_resampling
self
.
no_candidates
=
no_candidates
...
...
@@ -101,6 +103,7 @@ class MetisTuner(Tuner):
self
.
key_order
=
[]
self
.
cold_start_num
=
cold_start_num
self
.
selection_num_starting_points
=
selection_num_starting_points
self
.
exploration_probability
=
exploration_probability
self
.
minimize_constraints_fun
=
None
self
.
minimize_starting_points
=
None
...
...
@@ -128,7 +131,7 @@ class MetisTuner(Tuner):
except
Exception
as
ex
:
logger
.
exception
(
ex
)
raise
RuntimeError
(
"The format search space contains
\
some key that didn't define in key_order."
)
some key that didn't define in key_order."
)
if
key_type
==
'quniform'
:
if
key_range
[
2
]
==
1
:
...
...
@@ -191,7 +194,7 @@ class MetisTuner(Tuner):
Parameters
----------
parameter_id : int
Returns
-------
result : dict
...
...
@@ -200,13 +203,15 @@ class MetisTuner(Tuner):
init_parameter
=
_rand_init
(
self
.
x_bounds
,
self
.
x_types
,
1
)[
0
]
results
=
self
.
_pack_output
(
init_parameter
)
else
:
self
.
minimize_starting_points
=
_rand_init
(
self
.
x_bounds
,
self
.
x_types
,
\
self
.
selection_num_starting_points
)
results
=
self
.
_selection
(
self
.
samples_x
,
self
.
samples_y_aggregation
,
self
.
samples_y
,
self
.
x_bounds
,
self
.
x_types
,
threshold_samplessize_resampling
=
(
None
if
self
.
no_resampling
is
True
else
50
),
no_candidates
=
self
.
no_candidates
,
minimize_starting_points
=
self
.
minimize_starting_points
,
minimize_constraints_fun
=
self
.
minimize_constraints_fun
)
logger
.
info
(
"Generate paramageters:
\n
"
+
str
(
results
))
return
results
...
...
@@ -245,7 +250,7 @@ class MetisTuner(Tuner):
# calculate y aggregation
median
=
get_median
(
temp_y
)
self
.
samples_y_aggregation
[
idx
]
=
median
self
.
samples_y_aggregation
[
idx
]
=
[
median
]
else
:
self
.
samples_x
.
append
(
sample_x
)
self
.
samples_y
.
append
([
value
])
...
...
@@ -264,17 +269,21 @@ class MetisTuner(Tuner):
candidates
=
[]
samples_size_all
=
sum
([
len
(
i
)
for
i
in
samples_y
])
samples_size_unique
=
len
(
samples_y
)
# ===== STEP 1: Compute the current optimum =====
#sys.stderr.write("[%s] Predicting the optimal configuration from the current training dataset...\n" % (os.path.basename(__file__)))
gp_model
=
gp_create_model
.
create_model
(
samples_x
,
samples_y_aggregation
)
lm_current
=
gp_selection
.
selection
(
"lm"
,
samples_y_aggregation
,
x_bounds
,
x_types
,
gp_model
[
'model'
],
minimize_starting_points
,
minimize_constraints_fun
=
minimize_constraints_fun
)
lm_current
=
gp_selection
.
selection
(
"lm"
,
samples_y_aggregation
,
x_bounds
,
x_types
,
gp_model
[
'model'
],
minimize_starting_points
,
minimize_constraints_fun
=
minimize_constraints_fun
)
if
not
lm_current
:
return
None
if
no_candidates
is
False
:
candidates
.
append
({
'hyperparameter'
:
lm_current
[
'hyperparameter'
],
'expected_mu'
:
lm_current
[
'expected_mu'
],
...
...
@@ -284,10 +293,14 @@ class MetisTuner(Tuner):
# ===== STEP 2: Get recommended configurations for exploration =====
#sys.stderr.write("[%s] Getting candidates for exploration...\n"
#% \(os.path.basename(__file__)))
results_exploration
=
gp_selection
.
selection
(
"lc"
,
samples_y_aggregation
,
x_bounds
,
x_types
,
gp_model
[
'model'
],
minimize_starting_points
,
minimize_constraints_fun
=
minimize_constraints_fun
)
results_exploration
=
gp_selection
.
selection
(
"lc"
,
samples_y_aggregation
,
x_bounds
,
x_types
,
gp_model
[
'model'
],
minimize_starting_points
,
minimize_constraints_fun
=
minimize_constraints_fun
)
if
results_exploration
is
not
None
:
if
_num_past_samples
(
results_exploration
[
'hyperparameter'
],
samples_x
,
samples_y
)
==
0
:
...
...
@@ -308,12 +321,13 @@ class MetisTuner(Tuner):
print
(
"Getting candidates for exploitation...
\n
"
)
try
:
gmm
=
gmm_create_model
.
create_model
(
samples_x
,
samples_y_aggregation
)
results_exploitation
=
gmm_selection
.
selection
(
x_bounds
,
x_types
,
gmm
[
'clusteringmodel_good'
],
gmm
[
'clusteringmodel_bad'
],
minimize_starting_points
,
minimize_constraints_fun
=
minimize_constraints_fun
)
results_exploitation
=
gmm_selection
.
selection
(
x_bounds
,
x_types
,
gmm
[
'clusteringmodel_good'
],
gmm
[
'clusteringmodel_bad'
],
minimize_starting_points
,
minimize_constraints_fun
=
minimize_constraints_fun
)
if
results_exploitation
is
not
None
:
if
_num_past_samples
(
results_exploitation
[
'hyperparameter'
],
samples_x
,
samples_y
)
==
0
:
...
...
@@ -326,9 +340,9 @@ class MetisTuner(Tuner):
logger
.
info
(
"DEBUG: No suitable exploitation_gmm candidates were found
\n
"
)
except
ValueError
as
exception
:
# The exception: ValueError: Fitting the mixture model failed
# because some components have ill-defined empirical covariance
# (for instance caused by singleton or collapsed samples).
# The exception: ValueError: Fitting the mixture model failed
# because some components have ill-defined empirical covariance
# (for instance caused by singleton or collapsed samples).
# Try to decrease the number of components, or increase reg_covar.
logger
.
info
(
"DEBUG: No suitable exploitation_gmm candidates were found due to exception."
)
logger
.
info
(
exception
)
...
...
@@ -340,8 +354,6 @@ class MetisTuner(Tuner):
results_outliers
=
gp_outlier_detection
.
outlierDetection_threaded
(
samples_x
,
samples_y_aggregation
)
if
results_outliers
is
not
None
:
#temp = len(candidates)
for
results_outlier
in
results_outliers
:
if
_num_past_samples
(
samples_x
[
results_outlier
[
'samples_idx'
]],
samples_x
,
samples_y
)
<
max_resampling_per_x
:
candidates
.
append
({
'hyperparameter'
:
samples_x
[
results_outlier
[
'samples_idx'
]],
\
...
...
@@ -357,7 +369,10 @@ class MetisTuner(Tuner):
logger
.
info
(
"Evaluating information gain of %d candidates...
\n
"
)
next_improvement
=
0
threads_inputs
=
[[
candidate
,
samples_x
,
samples_y
,
x_bounds
,
x_types
,
minimize_constraints_fun
,
minimize_starting_points
]
for
candidate
in
candidates
]
threads_inputs
=
[[
candidate
,
samples_x
,
samples_y
,
x_bounds
,
x_types
,
minimize_constraints_fun
,
minimize_starting_points
]
for
candidate
in
candidates
]
threads_pool
=
ThreadPool
(
4
)
# Evaluate what would happen if we actually sample each candidate
threads_results
=
threads_pool
.
map
(
_calculate_lowest_mu_threaded
,
threads_inputs
)
...
...
@@ -368,21 +383,23 @@ class MetisTuner(Tuner):
if
threads_result
[
'expected_lowest_mu'
]
<
lm_current
[
'expected_mu'
]:
# Information gain
temp_improvement
=
threads_result
[
'expected_lowest_mu'
]
-
lm_current
[
'expected_mu'
]
if
next_improvement
>
temp_improvement
:
#
logger.info("DEBUG: \"next_candidate\" changed: \
#
lowest mu might reduce from %f (%s) to %f (%s), %s\n" %\
#
lm_current['expected_mu'], str(lm_current['hyperparameter']),\
#
threads_result['expected_lowest_mu'],\
#
str(threads_result['candidate']['hyperparameter']),\
#
threads_result['candidate']['reason'])
logger
.
info
(
"DEBUG:
\"
next_candidate
\"
changed:
\
lowest mu might reduce from %f (%s) to %f (%s), %s
\n
"
%
\
lm_current
[
'expected_mu'
],
str
(
lm_current
[
'hyperparameter'
]),
\
threads_result
[
'expected_lowest_mu'
],
\
str
(
threads_result
[
'candidate'
][
'hyperparameter'
]),
\
threads_result
[
'candidate'
][
'reason'
])
next_improvement
=
temp_improvement
next_candidate
=
threads_result
[
'candidate'
]
else
:
# ===== STEP 6: If we have no candidates, randomly pick one =====
logger
.
info
(
"DEBUG: No candidates from exploration, exploitation,
\
and resampling. We will random a candidate for next_candidate
\n
"
)
logger
.
info
(
"DEBUG: No candidates from exploration, exploitation,
\
and resampling. We will random a candidate for next_candidate
\n
"
)
next_candidate
=
_rand_with_constraints
(
x_bounds
,
x_types
)
\
if
minimize_starting_points
is
None
else
minimize_starting_points
[
0
]
...
...
@@ -391,7 +408,16 @@ class MetisTuner(Tuner):
next_candidate
=
{
'hyperparameter'
:
next_candidate
,
'reason'
:
"random"
,
'expected_mu'
:
expected_mu
,
'expected_sigma'
:
expected_sigma
}
# ===== STEP 7: If current optimal hyperparameter occurs in the history or exploration probability is less than the threshold, take next config as exploration step =====
outputs
=
self
.
_pack_output
(
lm_current
[
'hyperparameter'
])
ap
=
random
.
uniform
(
0
,
1
)
if
outputs
in
self
.
history_parameters
or
ap
<=
self
.
exploration_probability
:
if
next_candidate
is
not
None
:
outputs
=
self
.
_pack_output
(
next_candidate
[
'hyperparameter'
])
else
:
random_parameter
=
_rand_init
(
self
.
x_bounds
,
self
.
x_types
,
1
)[
0
]
outputs
=
self
.
_pack_output
(
random_parameter
)
self
.
history_parameters
.
append
(
outputs
)
return
outputs
...
...
@@ -437,10 +463,14 @@ def _calculate_lowest_mu_threaded(inputs):
# Aggregates multiple observation of the sample sampling points
temp_y_aggregation
=
[
statistics
.
median
(
temp_sample_y
)
for
temp_sample_y
in
temp_samples_y
]
temp_gp
=
gp_create_model
.
create_model
(
temp_samples_x
,
temp_y_aggregation
)
temp_results
=
gp_selection
.
selection
(
"lm"
,
temp_y_aggregation
,
x_bounds
,
x_types
,
temp_gp
[
'model'
],
minimize_starting_points
,
minimize_constraints_fun
=
minimize_constraints_fun
)
temp_results
=
gp_selection
.
selection
(
"lm"
,
temp_y_aggregation
,
x_bounds
,
x_types
,
temp_gp
[
'model'
],
minimize_starting_points
,
minimize_constraints_fun
=
minimize_constraints_fun
)
if
outputs
[
"expected_lowest_mu"
]
is
None
or
outputs
[
"expected_lowest_mu"
]
>
temp_results
[
'expected_mu'
]:
outputs
[
"expected_lowest_mu"
]
=
temp_results
[
'expected_mu'
]
...
...
src/webui/src/components/TrialsDetail.tsx
View file @
21165b53
...
...
@@ -92,7 +92,7 @@ class TrialsDetail extends React.Component<{}, TrialDetailState> {
formatter
:
function
(
data
:
TooltipForAccuracy
)
{
const
result
=
'
<div class="tooldetailAccuracy">
'
+
'
<div>Trial No:
'
+
data
.
data
[
0
]
+
'
</div>
'
+
'
<div>Default Metrc:
'
+
data
.
data
[
1
]
+
'
</div>
'
+
'
<div>Default Metr
i
c:
'
+
data
.
data
[
1
]
+
'
</div>
'
+
'
<div>Parameters:
'
+
'
<pre>
'
+
JSON
.
stringify
(
data
.
data
[
2
],
null
,
4
)
+
'
</pre>
'
+
'
</div>
'
+
...
...
test/config_test/examples/mnist-annotation.test.yml
View file @
21165b53
authorName
:
nni
experimentName
:
default_test
maxExecDuration
:
5m
maxTrialNum
:
2
trialConcurrency
:
1
maxTrialNum
:
4
trialConcurrency
:
2
tuner
:
builtinTunerName
:
Random
...
...
test/config_test/examples/mnist.test.yml
View file @
21165b53
authorName
:
nni
experimentName
:
default_test
maxExecDuration
:
5m
maxTrialNum
:
2
trialConcurrency
:
1
maxTrialNum
:
4
trialConcurrency
:
2
searchSpacePath
:
./mnist_search_space.json
tuner
:
...
...
test/config_test/multi_phase/multi_phase.test.yml
View file @
21165b53
authorName
:
nni
experimentName
:
default_test
maxExecDuration
:
5m
maxTrialNum
:
16
trialConcurrency
:
8
maxTrialNum
:
8
trialConcurrency
:
4
searchSpacePath
:
./search_space.json
tuner
:
...
...
test/pipelines-it-kubeflow.yml
View file @
21165b53
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
jobs
:
-
job
:
'
integration_test_kubeflow'
timeoutInMinutes
:
0
pool
:
'
NNI
CI
KUBE
CLI'
variables
:
new_docker_img
:
msranni/nni.it.kb:latest
steps
:
-
script
:
python3 -m pip install --upgrade pip setuptools --user
...
...
@@ -18,20 +34,6 @@ jobs:
condition
:
eq( variables['build_docker_img'], 'true' )
displayName
:
'
build
nni
bdsit_wheel'
-
script
:
|
cd deployment/pypi
docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
echo 'updating docker file for installing nni from local...'
# update Dockerfile to install NNI in docker image from whl file built in last step
sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
cat ../docker/Dockerfile
echo $IMG_TAG
docker build -f ../docker/Dockerfile -t $(new_docker_img) .
docker push $(new_docker_img)
condition
:
eq( variables['build_docker_img'], 'true' )
displayName
:
'
build
and
upload
nni
docker
image'
-
script
:
|
source install.sh
displayName
:
'
Install
nni
toolkit
via
source
code'
...
...
@@ -39,7 +41,18 @@ jobs:
-
script
:
|
if [ $(build_docker_img) = 'true' ]
then
export TEST_IMG=$(new_docker_img)
cd deployment/pypi
docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
echo 'updating docker file for installing nni from local...'
# update Dockerfile to install NNI in docker image from whl file built in last step
sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
cat ../docker/Dockerfile
export IMG_TAG=`date -u +%y%m%d%H%M`
docker build -f ../docker/Dockerfile -t $(test_docker_img_name):$IMG_TAG .
docker push $(test_docker_img_name):$IMG_TAG
export TEST_IMG=$(test_docker_img_name):$IMG_TAG
cd ../../
else
export TEST_IMG=$(existing_docker_img)
fi
...
...
test/pipelines-it-pai.yml
View file @
21165b53
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
jobs
:
-
job
:
'
integration_test_pai'
timeoutInMinutes
:
0
pool
:
'
NNI
CI
PAI
CLI'
variables
:
new_docker_img
:
msranni/nni.it.pai:latest
steps
:
-
script
:
python3 -m pip install --upgrade pip setuptools --user
...
...
@@ -18,20 +34,6 @@ jobs:
condition
:
eq( variables['build_docker_img'], 'true' )
displayName
:
'
build
nni
bdsit_wheel'
-
script
:
|
cd deployment/pypi
docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
echo 'updating docker file for installing nni from local...'
# update Dockerfile to install NNI in docker image from whl file built in last step
sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
cat ../docker/Dockerfile
echo $IMG_TAG
docker build -f ../docker/Dockerfile -t $(new_docker_img) .
docker push $(new_docker_img)
condition
:
eq( variables['build_docker_img'], 'true' )
displayName
:
'
build
and
upload
nni
docker
image'
-
script
:
|
source install.sh
displayName
:
'
Install
nni
toolkit
via
source
code'
...
...
@@ -39,10 +41,24 @@ jobs:
-
script
:
|
if [ $(build_docker_img) = 'true' ]
then
export TEST_IMG=$(new_docker_img)
cd deployment/pypi
docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
echo 'updating docker file for installing nni from local...'
# update Dockerfile to install NNI in docker image from whl file built in last step
sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
cat ../docker/Dockerfile
export IMG_TAG=`date -u +%y%m%d%H%M`
echo 'build and upload docker image'
docker build -f ../docker/Dockerfile -t $(test_docker_img_name):$IMG_TAG .
docker push $(test_docker_img_name):$IMG_TAG
export TEST_IMG=$(test_docker_img_name):$IMG_TAG
cd ../../
else
export TEST_IMG=$(existing_docker_img)
fi
echo "TEST_IMG:$TEST_IMG"
cd test
python3 generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --pai_pwd $(pai_pwd) \
...
...
tools/nni_annotation/README_zh_CN.md
View file @
21165b53
# NNI Annotation
介绍
# NNI Annotation
为了获得良好的用户体验并减少用户负担,NNI 设计了通过注释来使用的语法。
## 概述
使用 NNI 时,只需要:
为了获得良好的用户体验并减少对以后代码的影响,NNI 设计了通过 Annotation(标记)来使用的语法。 通过 Annotation,只需要在代码中加入一些注释字符串,就能启用 NNI,完全不影响代码原先的执行逻辑。
1.
在超参变量前加上如下标记:
'''@nni.variable(nni.choice(2,3,5,7),name=self.conv_size)'''
样例如下:
2.
在中间结果前加上:
'''@nni.report_intermediate_result(test_acc)'''
```
python
'''@nni.variable(nni.choice(0.1, 0.01, 0.001), name=learning_rate)'''
learning_rate
=
0.1
```
3.
在输出结果前加上:
'''@nni.report_final_result(test_acc)'''
此样例中,NNI 会从 (0.1, 0.01, 0.001) 中选择一个值赋给 learning_rate 变量。 第一行就是 NNI 的 Annotation,是 Python 中的一个字符串。 接下来的一行需要是赋值语句。 NNI 会根据 Annotation 行的信息,来给这一行的变量赋上相应的值。
4.
在代码中使用函数
`function_choice`
:
'''@nni.function_choice(max_pool(h_conv1, self.pool_size),avg_pool(h_conv1, self.pool_size),name=max_pool)'''
通过这种方式,不需要修改任何代码,代码既可以直接运行,又可以使用 NNI 来调参。
通过这种方法,能够轻松的在 NNI 中实现自动调参。
## Annotation 的类型:
`@nni.variable`
,
`nni.choice`
为搜索空间的类型,通过以下 10 种方法来定义搜索空间:
NNI 中,有 4 种类型的 Annotation;
1.
`@nni.variable(nni.choice(option1,option2,...,optionN),name=variable)`
变量值是选项中的一种,这些变量可以是任意的表达式。
### 1. 变量
2.
`@nni.variable(nni.randint(upper),name=variable)`
变量可以是范围 [0, upper) 中的任意整数。
`'''@nni.variable(sampling_algo, name)'''`
3.
`@nni.variable(nni.uniform(low, high),name=variable)`
变量值会是 low 和 high 之间均匀分布的某个值。
`@nni.variable`
用来标记变量。
4.
`@nni.variable(nni.quniform(low, high, q),name=variable)`
变量值会是 low 和 high 之间均匀分布的某个值,公式为:round(uniform(low, high) / q)
*
q
**参数**
5.
`@nni.variable(nni.loguniform(low, high),name=variable)`
变量值是 exp(uniform(low, high)) 的点,数值以对数均匀分布
。
-
**sampling_algo**
: 指定搜索空间的采样算法。 可将其换成 NNI 支持的其它采样函数,函数要以
`nni.`
开头。例如,
`choice`
或
`uniform`
,详见
[
SearchSpaceSpec
](
https://nni.readthedocs.io/zh/latest/SearchSpaceSpec.html
)
。
-
**name**
: 将被赋值的变量名称。 注意,此参数应该与下面一行等号左边的值相同
。
6.
`@nni.variable(nni.qloguniform(low, high, q),name=variable)`
变量值会是 low 和 high 之间均匀分布的某个值,公式为:round(exp(uniform(low, high)) / q)
*
q
NNI 支持如下 10 种类型来表示搜索空间:
7.
`@nni.variable(nni.normal(label, mu, sigma),name=variable)`
变量值为正态分布的实数值,平均值为 mu,标准方差为 sigma。
-
`@nni.variable(nni.choice(option1,option2,...,optionN),name=variable)`
变量值是选项中的一种,这些变量可以是任意的表达式。
-
`@nni.variable(nni.randint(upper),name=variable)`
变量可以是范围
[
0, upper) 中的任意整数。
- `@nni.variable(nni.uniform(low, high),name=variable)` 变量值会是 low 和 high 之间均匀分布的某个值。
- `@nni.variable(nni.quniform(low, high, q),name=variable)` 变量值会是 low 和 high 之间均匀分布的某个值,公式为:round(uniform(low, high) / q) * q
- `@nni.variable(nni.loguniform(low, high),name=variable)` 变量值是 exp(uniform(low, high)) 的点,数值以对数均匀分布。
- `@nni.variable(nni.qloguniform(low, high, q),name=variable)` 变量值会是 low 和 high 之间对数均匀分布的某个值,公式为:round(exp(uniform(low, high)) / q) * q
- `@nni.variable(nni.normal(mu, sigma),name=variable)` 变量值为正态分布的实数值,平均值为 mu,标准差为 sigma。
- `@nni.variable(nni.qnormal(mu, sigma, q),name=variable)` 变量值分布的公式为: round(normal(mu, sigma) / q) * q
- `@nni.variable(nni.lognormal(mu, sigma),name=variable)` 变量值分布的公式为: exp(normal(mu, sigma))
- `@nni.variable(nni.qlognormal(mu, sigma, q),name=variable)` 变量值分布的公式为: round(exp(normal(mu, sigma)) / q) * q
8.
`@nni.variable(nni.qnormal(label, mu, sigma, q),name=variable)`
变量值分布的公式为: round(normal(mu, sigma) / q)
*
q
样例如下:
9.
`@nni.variable(nni.lognormal(label, mu, sigma),name=variable)`
变量值分布的公式为: exp(normal(mu, sigma))
```python
'''@nni.variable(nni.choice(0.1, 0.01, 0.001), name=learning_rate)'''
learning_rate = 0.1
```
10.
`@nni.variable(nni.qlognormal(label, mu, sigma, q),name=variable)`
变量值分布的公式为: round(exp(normal(mu, sigma)) / q)
*
q
\ No newline at end of file
### 2. 函数
`'''@nni.function_choice(*functions, name)'''`
`@nni.function_choice` 可以从几个函数中选择一个来执行。
**参数**
- **functions**: 可选择的函数。 注意,必须是包括参数的完整函数调用。 例如 `max_pool(hidden_layer, pool_size)`。
- **name**: 将被替换的函数名称。
例如:
```python
"""@nni.function_choice(max_pool(hidden_layer, pool_size), avg_pool(hidden_layer, pool_size), name=max_pool)"""
h_pooling = max_pool(hidden_layer, pool_size)
```
### 3. 中间结果
`'''@nni.report_intermediate_result(metrics)'''`
`@nni.report_intermediate_result` 用来返回中间结果,这和 [Trials.md
](
https://nni.readthedocs.io/zh/latest/Trials.html
)
中的
`nni.report_intermediate_result`
用法一样。
### 4. 最终结果
`'''@nni.report_final_result(metrics)'''`
`@nni.report_final_result`
用来返回当前 Trial 的最终结果,这和
[
Trials.md
](
https://nni.readthedocs.io/zh/latest/Trials.html
)
中的
`nni.report_final_result`
用法一样。
\ No newline at end of file
tools/nni_annotation/examples/mnist_generated.py
View file @
21165b53
import
nni
"""A deep MNIST classifier using convolutional layers."""
import
logging
import
math
import
tempfile
import
time
import
tensorflow
as
tf
from
tensorflow.examples.tutorials.mnist
import
input_data
import
nni
FLAGS
=
None
logger
=
logging
.
getLogger
(
'mnist_AutoML'
)
...
...
@@ -123,12 +127,23 @@ def bias_variable(shape):
initial
=
tf
.
constant
(
0.1
,
shape
=
shape
)
return
tf
.
Variable
(
initial
)
def download_mnist_retry(data_dir, max_num_retries=20):
    """Read the MNIST data sets from ``data_dir``, retrying on collisions.

    Makes up to ``max_num_retries`` calls to
    ``input_data.read_data_sets(data_dir, one_hot=True)`` and returns the
    first result obtained. A ``tf.errors.AlreadyExistsError`` is followed
    by a one-second pause and another attempt; any other exception
    propagates to the caller unchanged.

    Raises:
        Exception: when no attempt succeeded within ``max_num_retries``.
    """
    for _attempt in range(max_num_retries):
        try:
            dataset = input_data.read_data_sets(data_dir, one_hot=True)
        except tf.errors.AlreadyExistsError:
            # NOTE(review): presumably another trial is writing the same
            # files at the same time -- confirm. Wait briefly, then retry.
            time.sleep(1)
        else:
            return dataset
    raise Exception("Failed to download MNIST.")
def
main
(
params
):
"""
Main function, build mnist network, run and send result to NNI.
"""
mnist
=
input_data
.
read_data_sets
(
params
[
'data_dir'
],
one_hot
=
True
)
def
main
(
params
):
# Import data
mnist
=
download_mnist_retry
(
params
[
'data_dir'
])
print
(
'Mnist download data done.'
)
logger
.
debug
(
'Mnist download data done.'
)
mnist_network
=
MnistNetwork
(
channel_1_num
=
params
[
'channel_1_num'
],
...
...
tools/nni_annotation/examples/mnist_with_annotation.py
View file @
21165b53
...
...
@@ -21,8 +21,9 @@
import
logging
import
math
import
tempfile
import
t
ensorflow
as
tf
import
t
ime
import
tensorflow
as
tf
from
tensorflow.examples.tutorials.mnist
import
input_data
FLAGS
=
None
...
...
@@ -168,13 +169,21 @@ def bias_variable(shape):
initial
=
tf
.
constant
(
0.1
,
shape
=
shape
)
return
tf
.
Variable
(
initial
)
def download_mnist_retry(data_dir, max_num_retries=20):
    """Read the MNIST data sets from ``data_dir``, retrying on collisions.

    Makes up to ``max_num_retries`` calls to
    ``input_data.read_data_sets(data_dir, one_hot=True)`` and returns the
    first result obtained. A ``tf.errors.AlreadyExistsError`` is followed
    by a one-second pause and another attempt; any other exception
    propagates to the caller unchanged.

    Raises:
        Exception: when no attempt succeeded within ``max_num_retries``.
    """
    for _attempt in range(max_num_retries):
        try:
            dataset = input_data.read_data_sets(data_dir, one_hot=True)
        except tf.errors.AlreadyExistsError:
            # NOTE(review): presumably another trial is writing the same
            # files at the same time -- confirm. Wait briefly, then retry.
            time.sleep(1)
        else:
            return dataset
    raise Exception("Failed to download MNIST.")
def
main
(
params
):
'''
Main function, build mnist network, run and send result to NNI.
'''
# Import data
mnist
=
input_data
.
read_data_sets
(
params
[
'data_dir'
]
,
one_hot
=
True
)
mnist
=
download_mnist_retry
(
params
[
'data_dir'
])
print
(
'Mnist download data done.'
)
logger
.
debug
(
'Mnist download data done.'
)
...
...
tools/nni_annotation/examples/mnist_without_annotation.py
View file @
21165b53
...
...
@@ -21,8 +21,9 @@
import
logging
import
math
import
tempfile
import
t
ensorflow
as
tf
import
t
ime
import
tensorflow
as
tf
from
tensorflow.examples.tutorials.mnist
import
input_data
import
nni
...
...
@@ -172,13 +173,21 @@ def bias_variable(shape):
initial
=
tf
.
constant
(
0.1
,
shape
=
shape
)
return
tf
.
Variable
(
initial
)
def download_mnist_retry(data_dir, max_num_retries=20):
    """Read the MNIST data sets from ``data_dir``, retrying on collisions.

    Makes up to ``max_num_retries`` calls to
    ``input_data.read_data_sets(data_dir, one_hot=True)`` and returns the
    first result obtained. A ``tf.errors.AlreadyExistsError`` is followed
    by a one-second pause and another attempt; any other exception
    propagates to the caller unchanged.

    Raises:
        Exception: when no attempt succeeded within ``max_num_retries``.
    """
    for _attempt in range(max_num_retries):
        try:
            dataset = input_data.read_data_sets(data_dir, one_hot=True)
        except tf.errors.AlreadyExistsError:
            # NOTE(review): presumably another trial is writing the same
            # files at the same time -- confirm. Wait briefly, then retry.
            time.sleep(1)
        else:
            return dataset
    raise Exception("Failed to download MNIST.")
def
main
(
params
):
'''
Main function, build mnist network, run and send result to NNI.
'''
# Import data
mnist
=
input_data
.
read_data_sets
(
params
[
'data_dir'
]
,
one_hot
=
True
)
mnist
=
download_mnist_retry
(
params
[
'data_dir'
])
print
(
'Mnist download data done.'
)
logger
.
debug
(
'Mnist download data done.'
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment