Commit 21165b53 (unverified) — OpenDAS/nni

Authored Mar 07, 2019 by SparkSnail; committed via GitHub on Mar 07, 2019.

Merge pull request #138 from Microsoft/master — merge master

Parents: 41a9a598, f10c3311

37 changed files in total; this page shows 17 of them, with 573 additions and 407 deletions (+573 −407).
Changed files on this page:

- examples/trials/mnist-distributed/dist_mnist.py (+264 −242)
- examples/trials/mnist-hyperband/mnist.py (+11 −2)
- examples/trials/mnist/mnist.py (+11 −2)
- examples/trials/mnist/mnist_before.py (+11 −2)
- src/sdk/pynni/nni/curvefitting_assessor/curvefitting_assessor.py (+5 −4)
- src/sdk/pynni/nni/metis_tuner/Regression_GP/OutlierDetection.py (+12 −14)
- src/sdk/pynni/nni/metis_tuner/metis_tuner.py (+83 −53)
- src/webui/src/components/TrialsDetail.tsx (+1 −1)
- test/config_test/examples/mnist-annotation.test.yml (+2 −2)
- test/config_test/examples/mnist.test.yml (+2 −2)
- test/config_test/multi_phase/multi_phase.test.yml (+2 −2)
- test/pipelines-it-kubeflow.yml (+32 −19)
- test/pipelines-it-pai.yml (+35 −19)
- tools/nni_annotation/README_zh_CN.md (+63 −37)
- tools/nni_annotation/examples/mnist_generated.py (+17 −2)
- tools/nni_annotation/examples/mnist_with_annotation.py (+11 −2)
- tools/nni_annotation/examples/mnist_without_annotation.py (+11 −2)
examples/trials/mnist-distributed/dist_mnist.py (+264 −242)

The distributed MNIST trial now downloads the dataset through a retry helper instead of calling `input_data.read_data_sets` directly. The rest of the diff is dominated by re-indentation, so the raw view shows the body of `main()` twice; it is reconstructed once below, with the behavioral changes marked.

```diff
@@ -13,10 +13,10 @@
 # limitations under the License.
 # ==============================================================================
 #
 # NNI (https://github.com/Microsoft/nni) modified this code to show how to
 # integrate distributed tensorflow training with NNI SDK
 #
 """Distributed MNIST training and validation, with model replicas.
 A simple softmax model with one hidden layer is defined. The parameters
```

(whitespace-only hunk: the header comment is unchanged)

@@ -54,19 +54,22 @@ import nni — the flag definitions are re-wrapped; their content is unchanged:

```python
flags = tf.app.flags
flags.DEFINE_string("data_dir", "/tmp/mnist-data",
                    "Directory for storing mnist data")
flags.DEFINE_boolean("download_only", False,
                     "Only perform downloading of data; Do not proceed to "
                     "session preparation, model definition or training")
flags.DEFINE_integer("task_index", None,
                     "Worker task index, should be >= 0. task_index=0 is "
                     "the master worker task the performs the variable "
                     "initialization ")
flags.DEFINE_integer("num_gpus", 1,
                     "Total number of gpus for each machine."
                     "If you don't use GPU, please set it to '0'")
flags.DEFINE_integer("replicas_to_aggregate", None,
                     "Number of replicas to aggregate before parameter update"
                     "is applied (For sync_replicas mode only; default: "
                     "num_workers)")
flags.DEFINE_integer("train_steps", 20000,
                     "Number of (global) training steps to perform")
flags.DEFINE_boolean(   # (the diff view truncates here)
```

@@ -96,237 +99,256 @@ IMAGE_PIXELS = 28 — the module body, reconstructed once as the new version; the behavioral changes are marked with `# was:` / `# new` comments:

```python
# {'cluster': cluster,
#  'task': {'type': 'worker', 'index': 1}})
def generate_default_params():
    '''
    Generate default hyper parameters
    '''
    return {
        'learning_rate': 0.01,
        'batch_size': 100,
        'hidden_units': 100,
    }

def download_mnist_retry(data_dir, max_num_retries=20):   # new in this commit
    """Try to download mnist dataset and avoid errors"""
    for _ in range(max_num_retries):
        try:
            return input_data.read_data_sets(data_dir, one_hot=True)
        except tf.errors.AlreadyExistsError:
            time.sleep(1)
    raise Exception("Failed to download MNIST.")

def main(unused_argv):
    # Receive NNI hyper parameter and update it onto default params
    RECEIVED_PARAMS = nni.get_next_parameter()
    PARAMS = generate_default_params()
    PARAMS.update(RECEIVED_PARAMS)

    # Parse environment variable TF_CONFIG to get job_name and task_index
    # If not explicitly specified in the constructor and the TF_CONFIG
    # environment variable is present, load cluster_spec from TF_CONFIG.
    tf_config = json.loads(os.environ.get('TF_CONFIG') or '{}')
    task_config = tf_config.get('task', {})
    task_type = task_config.get('type')
    task_index = task_config.get('index')
    FLAGS.job_name = task_type
    FLAGS.task_index = task_index

    mnist = download_mnist_retry(FLAGS.data_dir)  # was: input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
    if FLAGS.download_only:
        sys.exit(0)

    if FLAGS.job_name is None or FLAGS.job_name == "":
        raise ValueError("Must specify an explicit `job_name`")
    if FLAGS.task_index is None or FLAGS.task_index == "":
        raise ValueError("Must specify an explicit `task_index`")

    print("job name = %s" % FLAGS.job_name)
    print("task index = %d" % FLAGS.task_index)

    cluster_config = tf_config.get('cluster', {})
    ps_hosts = cluster_config.get('ps')
    worker_hosts = cluster_config.get('worker')
    ps_hosts_str = ','.join(ps_hosts)
    worker_hosts_str = ','.join(worker_hosts)
    FLAGS.ps_hosts = ps_hosts_str
    FLAGS.worker_hosts = worker_hosts_str

    # Construct the cluster and start the server
    ps_spec = FLAGS.ps_hosts.split(",")
    worker_spec = FLAGS.worker_hosts.split(",")

    # Get the number of workers.
    num_workers = len(worker_spec)

    cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})

    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
        if FLAGS.job_name == "ps":
            server.join()

    is_chief = (FLAGS.task_index == 0)
    if FLAGS.num_gpus > 0:
        # Avoid gpu allocation conflict: now allocate task_num -> #gpu
        # for each worker in the corresponding machine
        gpu = (FLAGS.task_index % FLAGS.num_gpus)
        worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu)
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to worker server
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu)

    # The device setter will automatically place Variables ops on separate
    # parameter servers (ps). The non-Variable ops will be placed on the workers.
    # The ps use CPU and workers use corresponding GPU
    with tf.device(
            tf.train.replica_device_setter(
                worker_device=worker_device,
                ps_device="/job:ps/cpu:0",
                cluster=cluster)):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Variables of the hidden layer
        hid_w = tf.Variable(
            tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, PARAMS['hidden_units']],
                stddev=1.0 / IMAGE_PIXELS),
            name="hid_w")
        hid_b = tf.Variable(tf.zeros([PARAMS['hidden_units']]), name="hid_b")

        # Variables of the softmax layer
        sm_w = tf.Variable(
            tf.truncated_normal(
                [PARAMS['hidden_units'], 10],
                stddev=1.0 / math.sqrt(PARAMS['hidden_units'])),
            name="sm_w")
        sm_b = tf.Variable(tf.zeros([10]), name="sm_b")

        # Ops: located on the worker specified with FLAGS.task_index
        x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
        y_ = tf.placeholder(tf.float32, [None, 10])

        hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
        hid = tf.nn.relu(hid_lin)

        y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
        cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

        opt = tf.train.AdamOptimizer(PARAMS['learning_rate'])

        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate

            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="mnist_sync_replicas")

        train_step = opt.minimize(cross_entropy, global_step=global_step)

        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                local_init_op = opt.chief_init_op

            ready_for_local_init_op = opt.ready_for_local_init_op

            # Initial token and chief queue runners required by the sync_replicas mode
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()

        init_op = tf.global_variables_initializer()
        train_dir = tempfile.mkdtemp()

        if FLAGS.sync_replicas:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                recovery_wait_secs=1,
                global_step=global_step)

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index])

        # The chief worker (task_index==0) session will prepare the session,
        # while the remaining workers will wait for the preparation to complete.
        if is_chief:
            print("Worker %d: Initializing session..." % FLAGS.task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." % FLAGS.task_index)

        if FLAGS.existing_servers:
            server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index]
            print("Using existing server at: %s" % server_grpc_url)

            sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

        print("Worker %d: Session initialization complete." % FLAGS.task_index)

        if FLAGS.sync_replicas and is_chief:
            # Chief worker will start the chief queue runner and call the init op.
            sess.run(sync_init_op)
            sv.start_queue_runners(sess, [chief_queue_runner])

        # Perform training
        time_begin = time.time()
        print("Training begins @ %f" % time_begin)

        local_step = 0
        while True:
            # Training feed
            batch_xs, batch_ys = mnist.train.next_batch(PARAMS['batch_size'])
            train_feed = {x: batch_xs, y_: batch_ys}

            _, step = sess.run([train_step, global_step], feed_dict=train_feed)
            local_step += 1

            now = time.time()
            print("%f: Worker %d: training step %d done (global step: %d)" %
                  (now, FLAGS.task_index, local_step, step))

            if step > 0 and step % 5000 == 0 and is_chief:
                val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
                interim_val_xent = sess.run(cross_entropy, feed_dict=val_feed)
                print("After %d training step(s), validation cross entropy = %g" %
                      (step, interim_val_xent))
                # Only chief worker can report intermediate metrics
                nni.report_intermediate_result(interim_val_xent)

            if step >= FLAGS.train_steps:
                break

        time_end = time.time()
        print("Training ends @ %f" % time_end)
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)

        # Validation feed
        val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
        val_xent = sess.run(cross_entropy, feed_dict=val_feed)
        print("After %d training step(s), validation cross entropy = %g" %
              (FLAGS.train_steps, val_xent))
        # Only chief worker can report final metrics
        if is_chief:
            nni.report_final_result(val_xent)

if __name__ == "__main__":
    tf.app.run()
```
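For orientation, here is a minimal sketch of the NNI trial protocol that the file above follows. Only the three `nni` calls are the actual SDK API; everything else (the parameter names, the fake metric) is illustrative:

```python
import nni

def generate_default_params():
    # Defaults, overridden by whatever the tuner sends for this trial.
    return {'learning_rate': 0.01, 'batch_size': 100}

if __name__ == '__main__':
    params = generate_default_params()
    # Ask the NNI tuner for this trial's hyper-parameters and merge them in.
    params.update(nni.get_next_parameter())
    metric = 1.0
    for step in range(10):
        metric = metric * 0.9                   # stand-in for validation cross entropy
        nni.report_intermediate_result(metric)  # periodic metric (chief worker only, above)
    nni.report_final_result(metric)             # exactly one final metric per trial
```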
examples/trials/mnist-hyperband/mnist.py (+11 −2)

Adds the `import time` it now needs, a `download_mnist_retry` helper, and switches `main` to use it:

```diff
@@ -3,8 +3,9 @@
 import logging
 import math
 import tempfile
-import tensorflow as tf
+import time
+import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
 import nni
@@ -142,13 +143,21 @@ def bias_variable(shape):
     initial = tf.constant(0.1, shape=shape)
     return tf.Variable(initial)
 
+def download_mnist_retry(data_dir, max_num_retries=20):
+    """Try to download mnist dataset and avoid errors"""
+    for _ in range(max_num_retries):
+        try:
+            return input_data.read_data_sets(data_dir, one_hot=True)
+        except tf.errors.AlreadyExistsError:
+            time.sleep(1)
+    raise Exception("Failed to download MNIST.")
+
 def main(params):
     '''
     Main function, build mnist network, run and send result to NNI.
     '''
     # Import data
-    mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
+    mnist = download_mnist_retry(params['data_dir'])
     print('Mnist download data done.')
     logger.debug('Mnist download data done.')
```
examples/trials/mnist/mnist.py (+11 −2)

Same change as mnist-hyperband: `import time`, the retry helper, and `main` switched over to it:

```diff
@@ -4,8 +4,9 @@ import argparse
 import logging
 import math
 import tempfile
-import tensorflow as tf
+import time
+import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
 import nni
@@ -143,13 +144,21 @@ def bias_variable(shape):
     initial = tf.constant(0.1, shape=shape)
     return tf.Variable(initial)
 
+def download_mnist_retry(data_dir, max_num_retries=20):
+    """Try to download mnist dataset and avoid errors"""
+    for _ in range(max_num_retries):
+        try:
+            return input_data.read_data_sets(data_dir, one_hot=True)
+        except tf.errors.AlreadyExistsError:
+            time.sleep(1)
+    raise Exception("Failed to download MNIST.")
+
 def main(params):
     '''
     Main function, build mnist network, run and send result to NNI.
     '''
     # Import data
-    mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
+    mnist = download_mnist_retry(params['data_dir'])
     print('Mnist download data done.')
     logger.debug('Mnist download data done.')
```
examples/trials/mnist/mnist_before.py (+11 −2)

Same retry-download change again:

```diff
@@ -3,8 +3,9 @@ import argparse
 import logging
 import math
 import tempfile
-import tensorflow as tf
+import time
+import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
 
 FLAGS = None
@@ -143,13 +144,21 @@ def bias_variable(shape):
     initial = tf.constant(0.1, shape=shape)
     return tf.Variable(initial)
 
+def download_mnist_retry(data_dir, max_num_retries=20):
+    """Try to download mnist dataset and avoid errors"""
+    for _ in range(max_num_retries):
+        try:
+            return input_data.read_data_sets(data_dir, one_hot=True)
+        except tf.errors.AlreadyExistsError:
+            time.sleep(1)
+    raise Exception("Failed to download MNIST.")
+
 def main(params):
     '''
     Main function, build mnist network, run and send result to NNI.
     '''
     # Import data
-    mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
+    mnist = download_mnist_retry(params['data_dir'])
     print('Mnist download data done.')
     logger.debug('Mnist download data done.')
```
src/sdk/pynni/nni/curvefitting_assessor/curvefitting_assessor.py (+5 −4)

The single judgment counter becomes a per-trial dict: `last_judgment_num` records, for each `trial_job_id`, the step at which that trial was last judged, so each trial is re-assessed only after `gap` further intermediate results of its own:

```diff
@@ -57,8 +57,8 @@ class CurvefittingAssessor(Assessor):
         self.threshold = threshold
         # Record the number of gap
         self.gap = gap
-        # Record the number of times of judgments
-        self.judgment_num = 0
+        # Record the number of intermediate result in the lastest judgment
+        self.last_judgment_num = dict()
         # Record the best performance
         self.set_best_performance = False
         self.completed_best_performance = None
@@ -112,9 +112,10 @@ class CurvefittingAssessor(Assessor):
         curr_step = len(trial_history)
         if curr_step < self.start_step:
             return AssessResult.Good
-        if (curr_step - self.start_step) // self.gap <= self.judgment_num:
+        if trial_job_id in self.last_judgment_num.keys() and curr_step - self.last_judgment_num[trial_job_id] < self.gap:
             return AssessResult.Good
-        self.judgment_num = (curr_step - self.start_step) // self.gap
+        self.last_judgment_num[trial_job_id] = curr_step
 
         try:
             start_time = datetime.datetime.now()
```
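A toy illustration (not part of the diff) of why the single counter becomes a dict: with several concurrent trials, each `trial_job_id` needs its own record of the step at which it was last judged:

```python
# Hypothetical standalone sketch of the per-trial gap bookkeeping.
last_judgment_num = {}   # trial_job_id -> step at the last judgment
gap = 5

def should_judge(trial_job_id, curr_step):
    # Skip judgment if this trial was judged fewer than `gap` steps ago.
    if trial_job_id in last_judgment_num and curr_step - last_judgment_num[trial_job_id] < gap:
        return False
    last_judgment_num[trial_job_id] = curr_step
    return True

assert should_judge('trial_A', 6)          # first judgment for trial_A
assert not should_judge('trial_A', 8)      # only 2 steps since trial_A was judged
assert should_judge('trial_B', 8)          # trial_B is tracked independently
assert should_judge('trial_A', 11)         # 5 steps have elapsed for trial_A
```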
src/sdk/pynni/nni/metis_tuner/Regression_GP/OutlierDetection.py (+12 −14)

Renames `gp_create_model.createModel` to `create_model` at both call sites and re-wraps some continuation lines; behavior is unchanged. The file still lacks a trailing newline.

```diff
@@ -37,13 +37,13 @@ def _outlierDetection_threaded(inputs):
     sys.stderr.write("[%s] DEBUG: Evaluating %dth of %d samples\n" \
                      % (os.path.basename(__file__), samples_idx + 1, len(samples_x)))
     outlier = None
 
     # Create a diagnostic regression model which removes the sample that we want to evaluate
-    diagnostic_regressor_gp = gp_create_model.createModel(\
+    diagnostic_regressor_gp = gp_create_model.create_model(\
                               samples_x[0:samples_idx] + samples_x[samples_idx + 1:],\
                               samples_y_aggregation[0:samples_idx] + samples_y_aggregation[samples_idx + 1:])
     mu, sigma = gp_prediction.predict(samples_x[samples_idx], diagnostic_regressor_gp['model'])
 
     # 2.33 is the z-score for 98% confidence level
     if abs(samples_y_aggregation[samples_idx] - mu) > (2.33 * sigma):
         outlier = {"samples_idx": samples_idx,
```

@@ -51,26 +51,26 — only continuation formatting changes in this hunk; for context:

```python
               "expected_sigma": sigma,
               "difference": abs(samples_y_aggregation[samples_idx] - mu) - (2.33 * sigma)}
    return outlier

def outlierDetection_threaded(samples_x, samples_y_aggregation):
    '''
    Use Multi-thread to detect the outlier
    '''
    outliers = []
    threads_inputs = [[samples_idx, samples_x, samples_y_aggregation]\
                      for samples_idx in range(0, len(samples_x))]
    threads_pool = ThreadPool(min(4, len(threads_inputs)))
    threads_results = threads_pool.map(_outlierDetection_threaded, threads_inputs)
    threads_pool.close()
    threads_pool.join()

    for threads_result in threads_results:
        if threads_result is not None:
            outliers.append(threads_result)
        else:
            print("error here.")

    outliers = None if len(outliers) == 0 else outliers
    return outliers
```

```diff
@@ -79,21 +79,19 @@ def outlierDetection(samples_x, samples_y_aggregation):
     '''
     outliers = []
     for samples_idx in range(0, len(samples_x)):
         #sys.stderr.write("[%s] DEBUG: Evaluating %d of %d samples\n"
         # \ % (os.path.basename(__file__), samples_idx + 1, len(samples_x)))
-        diagnostic_regressor_gp = gp_create_model.createModel(\
+        diagnostic_regressor_gp = gp_create_model.create_model(\
                                   samples_x[0:samples_idx] + samples_x[samples_idx + 1:],\
                                   samples_y_aggregation[0:samples_idx] + samples_y_aggregation[samples_idx + 1:])
         mu, sigma = gp_prediction.predict(samples_x[samples_idx],
                                           diagnostic_regressor_gp['model'])
         # 2.33 is the z-score for 98% confidence level
         if abs(samples_y_aggregation[samples_idx] - mu) > (2.33 * sigma):
             outliers.append({"samples_idx": samples_idx,
                              "expected_mu": mu,
                              "expected_sigma": sigma,
                              "difference": abs(samples_y_aggregation[samples_idx] - mu) - (2.33 * sigma)})
 
     outliers = None if len(outliers) == 0 else outliers
     return outliers
\ No newline at end of file
```
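For reference, a self-contained sketch (hypothetical names) of the leave-one-out test this module performs; in the real code `mu` and `sigma` come from a Gaussian-process model fitted on all the other samples. A point is flagged when its observation lies more than 2.33 predicted standard deviations from the predicted mean, i.e. outside a 98% confidence band:

```python
def is_outlier(observed_y, mu, sigma, z=2.33):
    """Flag a sample whose observation deviates from the model's
    prediction by more than z standard deviations (z=2.33 ~ 98%)."""
    return abs(observed_y - mu) > z * sigma

print(is_outlier(1.5, mu=1.0, sigma=0.1))  # True: 0.5 > 0.233
print(is_outlier(1.1, mu=1.0, sigma=0.1))  # False: 0.1 <= 0.233
```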
src/sdk/pynni/nni/metis_tuner/metis_tuner.py (+83 −53)

Summary of this file's changes: imports regrouped; `selection_num_starting_points` default raised from 10 to 600; a new `exploration_probability` parameter (default 0.1) drives a new STEP 7 in `_selection` that occasionally returns an exploration candidate instead of the current optimum; aggregated medians are now stored as single-element lists; several long call sites are re-wrapped.
@@ -24,22 +24,20 @@ import os — the imports are regrouped and sorted (stdlib, then third-party, then package modules); the new block reads:

```python
import random
import statistics
import sys
from enum import Enum, unique
from multiprocessing.dummy import Pool as ThreadPool

import numpy as np

import nni.metis_tuner.lib_constraint_summation as lib_constraint_summation
import nni.metis_tuner.lib_data as lib_data
import nni.metis_tuner.Regression_GMM.CreateModel as gmm_create_model
import nni.metis_tuner.Regression_GMM.Selection as gmm_selection
import nni.metis_tuner.Regression_GP.CreateModel as gp_create_model
import nni.metis_tuner.Regression_GP.OutlierDetection as gp_outlier_detection
import nni.metis_tuner.Regression_GP.Prediction as gp_prediction
import nni.metis_tuner.Regression_GP.Selection as gp_selection
from nni.tuner import Tuner

logger = logging.getLogger("Metis_Tuner_AutoML")
```

@@ -67,33 +65,37 @@ class MetisTuner(Tuner): — `selection_num_starting_points` now defaults to 600 instead of 10, and a new `exploration_probability` argument (default 0.1) is documented and stored, along with a new `history_parameters` list:

```diff
     """
 
-    def __init__(self, optimize_mode="maximize", no_resampling=True, no_candidates=True,
-                 selection_num_starting_points=10, cold_start_num=10):
+    def __init__(self, optimize_mode="maximize", no_resampling=True, no_candidates=True,
+                 selection_num_starting_points=600, cold_start_num=10,
+                 exploration_probability=0.1):
         """
         Parameters
         ----------
         optimize_mode : str
             optimize_mode is a string that including two mode "maximize" and "minimize"
 
         no_resampling : bool
             True or False. Should Metis consider re-sampling as part of the search strategy?
             If you are confident that the training dataset is noise-free, then you do not need re-sampling.
 
         no_candidates: bool
             True or False. Should Metis suggest parameters for the next benchmark?
             If you do not plan to do more benchmarks, Metis can skip this step.
 
         selection_num_starting_points: int
             how many times Metis should try to find the global optimal in the search space?
             The higher the number, the longer it takes to output the solution.
 
         cold_start_num: int
             Metis need some trial result to get cold start. when the number of trial result is less than
             cold_start_num, Metis will randomly sample hyper-parameter for trial.
 
+        exploration_probability: float
+            The probability of Metis to select parameter from exploration instead of exploitation.
         """
 
         self.samples_x = []
         self.samples_y = []
         self.samples_y_aggregation = []
+        self.history_parameters = []
         self.space = None
         self.no_resampling = no_resampling
         self.no_candidates = no_candidates
```
```diff
@@ -101,6 +103,7 @@ class MetisTuner(Tuner):
         self.key_order = []
         self.cold_start_num = cold_start_num
         self.selection_num_starting_points = selection_num_starting_points
+        self.exploration_probability = exploration_probability
         self.minimize_constraints_fun = None
         self.minimize_starting_points = None
```
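As a usage sketch, the new argument can be set when the tuner is constructed directly (assuming the module path matches this file's location; in practice the tuner is normally selected through the experiment YAML):

```python
from nni.metis_tuner.metis_tuner import MetisTuner

# exploration_probability=0.1: roughly one suggestion in ten is taken from
# the exploration candidates instead of the current predicted optimum.
tuner = MetisTuner(optimize_mode="maximize",
                   no_resampling=True,
                   no_candidates=True,
                   selection_num_starting_points=600,
                   cold_start_num=10,
                   exploration_probability=0.1)
```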
@@ -128,7 +131,7 — only the indentation of a wrapped error message changes:

```python
            except Exception as ex:
                logger.exception(ex)
                raise RuntimeError("The format search space contains \
                    some key that didn't define in key_order.")

        if key_type == 'quniform':
            if key_range[2] == 1:
```

@@ -191,7 +194,7 — whitespace-only change in the `generate_parameters` docstring (`parameter_id : int` / `Returns` / `result : dict`).

@@ -200,13 +203,15 — the `self._selection(...)` call is re-wrapped; content unchanged (including the "paramageters" typo in the log message, which is in the source):

```python
            init_parameter = _rand_init(self.x_bounds, self.x_types, 1)[0]
            results = self._pack_output(init_parameter)
        else:
            self.minimize_starting_points = _rand_init(self.x_bounds, self.x_types, \
                                                       self.selection_num_starting_points)
            results = self._selection(self.samples_x, self.samples_y_aggregation, self.samples_y,
                                      self.x_bounds, self.x_types,
                                      threshold_samplessize_resampling=(None if self.no_resampling is True else 50),
                                      no_candidates=self.no_candidates,
                                      minimize_starting_points=self.minimize_starting_points,
                                      minimize_constraints_fun=self.minimize_constraints_fun)

        logger.info("Generate paramageters:\n" + str(results))
        return results
```

@@ -245,7 +250,7 — the aggregated median is now stored as a single-element list:

```diff
                 # calculate y aggregation
                 median = get_median(temp_y)
-                self.samples_y_aggregation[idx] = median
+                self.samples_y_aggregation[idx] = [median]
             else:
                 self.samples_x.append(sample_x)
                 self.samples_y.append([value])
```

@@ -264,17 +269,21 — STEP 1 of `_selection`; the `gp_selection.selection("lm", ...)` call is re-wrapped to one argument per line:

```python
        candidates = []
        samples_size_all = sum([len(i) for i in samples_y])
        samples_size_unique = len(samples_y)

        # ===== STEP 1: Compute the current optimum =====
        gp_model = gp_create_model.create_model(samples_x, samples_y_aggregation)
        lm_current = gp_selection.selection(
            "lm",
            samples_y_aggregation,
            x_bounds,
            x_types,
            gp_model['model'],
            minimize_starting_points,
            minimize_constraints_fun=minimize_constraints_fun)
        if not lm_current:
            return None

        if no_candidates is False:
            candidates.append({'hyperparameter': lm_current['hyperparameter'],
                               'expected_mu': lm_current['expected_mu'],
```

@@ -284,10 +293,14 and @@ -308,12 +321,13 — the STEP 2 `gp_selection.selection("lc", ...)` call (candidates for exploration) and the STEP 3 `gmm_selection.selection(...)` call (candidates for exploitation) are re-wrapped the same way; behavior is unchanged.

@@ -326,9 +340,9 — the comment block explaining the GMM `ValueError` ("Fitting the mixture model failed because some components have ill-defined empirical covariance, for instance caused by singleton or collapsed samples; try to decrease the number of components, or increase reg_covar") is re-indented only.

@@ -340,8 +354,6 — a commented-out line is dropped from the resampling step:

```diff
             results_outliers = gp_outlier_detection.outlierDetection_threaded(samples_x, samples_y_aggregation)
             if results_outliers is not None:
-                #temp = len(candidates)
                 for results_outlier in results_outliers:
                     if _num_past_samples(samples_x[results_outlier['samples_idx']], samples_x, samples_y) < max_resampling_per_x:
                         candidates.append({'hyperparameter': samples_x[results_outlier['samples_idx']],\
```

@@ -357,7 +369,10 — the `threads_inputs` list comprehension is re-wrapped:

```python
            logger.info("Evaluating information gain of %d candidates...\n")
            next_improvement = 0

            threads_inputs = [[candidate, samples_x, samples_y, x_bounds, x_types,
                               minimize_constraints_fun, minimize_starting_points]
                              for candidate in candidates]
            threads_pool = ThreadPool(4)
            # Evaluate what would happen if we actually sample each candidate
            threads_results = threads_pool.map(_calculate_lowest_mu_threaded, threads_inputs)
```

@@ -368,21 +383,23 — as rendered in the diff view, the commented-out "next_candidate changed" debug logging is activated as a real `logger.info` call, and the "no candidates" message in the `else` branch is re-wrapped:

```python
                if threads_result['expected_lowest_mu'] < lm_current['expected_mu']:
                    # Information gain
                    temp_improvement = threads_result['expected_lowest_mu'] - lm_current['expected_mu']

                    if next_improvement > temp_improvement:
                        logger.info("DEBUG: \"next_candidate\" changed: \
                                    lowest mu might reduce from %f (%s) to %f (%s), %s\n" %\
                                    lm_current['expected_mu'], str(lm_current['hyperparameter']),\
                                    threads_result['expected_lowest_mu'],\
                                    str(threads_result['candidate']['hyperparameter']),\
                                    threads_result['candidate']['reason'])
                        next_improvement = temp_improvement
                        next_candidate = threads_result['candidate']
        else:
            # ===== STEP 6: If we have no candidates, randomly pick one =====
            logger.info("DEBUG: No candidates from exploration, exploitation,\
                         and resampling. We will random a candidate for next_candidate\n")
            next_candidate = _rand_with_constraints(x_bounds, x_types) \
                if minimize_starting_points is None else minimize_starting_points[0]
```

@@ -391,7 +408,16 — the new STEP 7 implements the exploration probability: if the current optimum was already suggested before, or a uniform draw falls below `exploration_probability`, the exploration candidate (or a random parameter) is returned instead:

```diff
             next_candidate = {'hyperparameter': next_candidate, 'reason': "random",
                               'expected_mu': expected_mu, 'expected_sigma': expected_sigma}
 
+        # ===== STEP 7: If current optimal hyperparameter occurs in the history or exploration probability is less than the threshold, take next config as exploration step =====
+        outputs = self._pack_output(lm_current['hyperparameter'])
+        ap = random.uniform(0, 1)
+        if outputs in self.history_parameters or ap <= self.exploration_probability:
+            if next_candidate is not None:
+                outputs = self._pack_output(next_candidate['hyperparameter'])
+            else:
+                random_parameter = _rand_init(self.x_bounds, self.x_types, 1)[0]
+                outputs = self._pack_output(random_parameter)
+        self.history_parameters.append(outputs)
+        return outputs
```

@@ -437,10 +463,14 — in `_calculate_lowest_mu_threaded`, the `gp_selection.selection("lm", ...)` call is re-wrapped the same way as in STEP 1:

```python
        # Aggregates multiple observation of the sample sampling points
        temp_y_aggregation = [statistics.median(temp_sample_y) for temp_sample_y in temp_samples_y]
        temp_gp = gp_create_model.create_model(temp_samples_x, temp_y_aggregation)
        temp_results = gp_selection.selection(
            "lm",
            temp_y_aggregation,
            x_bounds,
            x_types,
            temp_gp['model'],
            minimize_starting_points,
            minimize_constraints_fun=minimize_constraints_fun)

        if outputs["expected_lowest_mu"] is None or outputs["expected_lowest_mu"] > temp_results['expected_mu']:
            outputs["expected_lowest_mu"] = temp_results['expected_mu']
```
src/webui/src/components/TrialsDetail.tsx (+1 −1)

Fixes a typo in the accuracy tooltip: "Default Metrc" becomes "Default Metric".

```diff
@@ -92,7 +92,7 @@ class TrialsDetail extends React.Component<{}, TrialDetailState> {
             formatter: function (data: TooltipForAccuracy) {
                 const result = '<div class="tooldetailAccuracy">' +
                     '<div>Trial No: ' + data.data[0] + '</div>' +
-                    '<div>Default Metrc: ' + data.data[1] + '</div>' +
+                    '<div>Default Metric: ' + data.data[1] + '</div>' +
                     '<div>Parameters: ' +
                     '<pre>' + JSON.stringify(data.data[2], null, 4) + '</pre>' +
                     '</div>' +
```
test/config_test/examples/mnist-annotation.test.yml (+2 −2)

```diff
 authorName: nni
 experimentName: default_test
 maxExecDuration: 5m
-maxTrialNum: 2
-trialConcurrency: 1
+maxTrialNum: 4
+trialConcurrency: 2
 tuner:
   builtinTunerName: Random
```
test/config_test/examples/mnist.test.yml (+2 −2)

```diff
 authorName: nni
 experimentName: default_test
 maxExecDuration: 5m
-maxTrialNum: 2
-trialConcurrency: 1
+maxTrialNum: 4
+trialConcurrency: 2
 searchSpacePath: ./mnist_search_space.json
 tuner:
```
test/config_test/multi_phase/multi_phase.test.yml (+2 −2)

```diff
 authorName: nni
 experimentName: default_test
 maxExecDuration: 5m
-maxTrialNum: 16
-trialConcurrency: 8
+maxTrialNum: 8
+trialConcurrency: 4
 searchSpacePath: ./search_space.json
 tuner:
```
test/pipelines-it-kubeflow.yml (+32 −19)

The standalone "build and upload nni docker image" step is removed; building and pushing the test image now happens inside the later conditional script, tagged with a timestamp. The `sed` substitution rewrites the Dockerfile line `RUN python3 -m pip --no-cache-dir install nni` into `COPY ./dist/* .` followed by `RUN python3 -m pip install nni-*.whl`, so the image installs the wheel built in the previous step.

```yaml
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

jobs:
- job: 'integration_test_kubeflow'
  timeoutInMinutes: 0
  pool: 'NNI CI KUBE CLI'
  variables:
    new_docker_img: msranni/nni.it.kb:latest
  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
```

```diff
@@ -18,20 +34,6 @@ jobs:
     condition: eq( variables['build_docker_img'], 'true' )
     displayName: 'build nni bdsit_wheel'
-  - script: |
-      cd deployment/pypi
-      docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
-      echo 'updating docker file for installing nni from local...'
-      # update Dockerfile to install NNI in docker image from whl file built in last step
-      sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
-      cat ../docker/Dockerfile
-      echo $IMG_TAG
-      docker build -f ../docker/Dockerfile -t $(new_docker_img) .
-      docker push $(new_docker_img)
-    condition: eq( variables['build_docker_img'], 'true' )
-    displayName: 'build and upload nni docker image'
   - script: |
       source install.sh
     displayName: 'Install nni toolkit via source code'
@@ -39,7 +41,18 @@ jobs:
   - script: |
       if [ $(build_docker_img) = 'true' ]
       then
-        export TEST_IMG=$(new_docker_img)
+        cd deployment/pypi
+        docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
+        echo 'updating docker file for installing nni from local...'
+        # update Dockerfile to install NNI in docker image from whl file built in last step
+        sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
+        cat ../docker/Dockerfile
+        export IMG_TAG=`date -u +%y%m%d%H%M`
+        docker build -f ../docker/Dockerfile -t $(test_docker_img_name):$IMG_TAG .
+        docker push $(test_docker_img_name):$IMG_TAG
+        export TEST_IMG=$(test_docker_img_name):$IMG_TAG
+        cd ../../
       else
         export TEST_IMG=$(existing_docker_img)
       fi
```
test/pipelines-it-pai.yml (+35 −19)

Same restructuring as pipelines-it-kubeflow.yml; the file begins with the same MIT license header as above.

```yaml
jobs:
- job: 'integration_test_pai'
  timeoutInMinutes: 0
  pool: 'NNI CI PAI CLI'
  variables:
    new_docker_img: msranni/nni.it.pai:latest
  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
```

```diff
@@ -18,20 +34,6 @@ jobs:
     condition: eq( variables['build_docker_img'], 'true' )
     displayName: 'build nni bdsit_wheel'
-  - script: |
-      cd deployment/pypi
-      docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
-      echo 'updating docker file for installing nni from local...'
-      # update Dockerfile to install NNI in docker image from whl file built in last step
-      sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
-      cat ../docker/Dockerfile
-      echo $IMG_TAG
-      docker build -f ../docker/Dockerfile -t $(new_docker_img) .
-      docker push $(new_docker_img)
-    condition: eq( variables['build_docker_img'], 'true' )
-    displayName: 'build and upload nni docker image'
   - script: |
       source install.sh
     displayName: 'Install nni toolkit via source code'
@@ -39,10 +41,24 @@ jobs:
   - script: |
       if [ $(build_docker_img) = 'true' ]
       then
-        export TEST_IMG=$(new_docker_img)
+        cd deployment/pypi
+        docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
+        echo 'updating docker file for installing nni from local...'
+        # update Dockerfile to install NNI in docker image from whl file built in last step
+        sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
+        cat ../docker/Dockerfile
+        export IMG_TAG=`date -u +%y%m%d%H%M`
+        echo 'build and upload docker image'
+        docker build -f ../docker/Dockerfile -t $(test_docker_img_name):$IMG_TAG .
+        docker push $(test_docker_img_name):$IMG_TAG
+        export TEST_IMG=$(test_docker_img_name):$IMG_TAG
+        cd ../../
       else
         export TEST_IMG=$(existing_docker_img)
       fi
+      echo "TEST_IMG:$TEST_IMG"
       cd test
       python3 generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --pai_pwd $(pai_pwd) \
```
tools/nni_annotation/README_zh_CN.md (+63 −37)

The Chinese annotation README is rewritten from a short numbered how-to into a structured guide to the four annotation types. The new document, translated from the Chinese:

# NNI Annotation

## Overview

To provide a good user experience and minimize the impact on existing code, NNI offers an annotation-based syntax. With annotations, you enable NNI by adding a few comment strings to your code, without affecting its original execution logic at all.

An example:

```python
'''@nni.variable(nni.choice(0.1, 0.01, 0.001), name=learning_rate)'''
learning_rate = 0.1
```

In this example, NNI chooses one value from (0.1, 0.01, 0.001) and assigns it to the `learning_rate` variable. The first line is the NNI annotation, an ordinary Python string. The line that follows must be an assignment statement; NNI assigns the variable on that line according to the information in the annotation.

In this way the code needs no modification: it can run directly as-is, and it can also be tuned with NNI.

## Types of annotations

NNI has 4 types of annotations:

### 1. Variables

`'''@nni.variable(sampling_algo, name)'''`

`@nni.variable` marks a variable.

**Arguments**

- **sampling_algo**: the sampling algorithm that specifies the search space. It can be any sampling function supported by NNI, prefixed with `nni.`, such as `choice` or `uniform`; see [SearchSpaceSpec](https://nni.readthedocs.io/zh/latest/SearchSpaceSpec.html) for details.
- **name**: the name of the variable to be assigned. Note that it must match the left-hand side of the assignment on the following line.

NNI supports the following 10 types to express a search space:

- `@nni.variable(nni.choice(option1,option2,...,optionN),name=variable)` The variable value is one of the options, each of which can be an arbitrary expression.
- `@nni.variable(nni.randint(upper),name=variable)` The variable can be any integer in the range [0, upper).
- `@nni.variable(nni.uniform(low, high),name=variable)` The variable value is uniformly distributed between low and high.
- `@nni.variable(nni.quniform(low, high, q),name=variable)` The variable value is round(uniform(low, high) / q) * q.
- `@nni.variable(nni.loguniform(low, high),name=variable)` The variable value is exp(uniform(low, high)), i.e. log-uniformly distributed.
- `@nni.variable(nni.qloguniform(low, high, q),name=variable)` The variable value is round(exp(uniform(low, high)) / q) * q.
- `@nni.variable(nni.normal(mu, sigma),name=variable)` The variable value is a real number, normally distributed with mean mu and standard deviation sigma.
- `@nni.variable(nni.qnormal(mu, sigma, q),name=variable)` The variable value is round(normal(mu, sigma) / q) * q.
- `@nni.variable(nni.lognormal(mu, sigma),name=variable)` The variable value is exp(normal(mu, sigma)).
- `@nni.variable(nni.qlognormal(mu, sigma, q),name=variable)` The variable value is round(exp(normal(mu, sigma)) / q) * q.

An example:

```python
'''@nni.variable(nni.choice(0.1, 0.01, 0.001), name=learning_rate)'''
learning_rate = 0.1
```

### 2. Functions

`'''@nni.function_choice(*functions, name)'''`

`@nni.function_choice` chooses one of several functions to execute.

**Arguments**

- **functions**: the candidate functions. Note that each must be a complete function call including its arguments, e.g. `max_pool(hidden_layer, pool_size)`.
- **name**: the name of the function that will be replaced.

For example:

```python
"""@nni.function_choice(max_pool(hidden_layer, pool_size), avg_pool(hidden_layer, pool_size), name=max_pool)"""
h_pooling = max_pool(hidden_layer, pool_size)
```

### 3. Intermediate results

`'''@nni.report_intermediate_result(metrics)'''`

`@nni.report_intermediate_result` reports intermediate results, with the same usage as `nni.report_intermediate_result` in [Trials.md](https://nni.readthedocs.io/zh/latest/Trials.html).

### 4. Final result

`'''@nni.report_final_result(metrics)'''`

`@nni.report_final_result` reports the final result of the current trial, with the same usage as `nni.report_final_result` in [Trials.md](https://nni.readthedocs.io/zh/latest/Trials.html).
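Putting the annotation types together, a trial script might look like the following sketch (`train_one_epoch` is a hypothetical stand-in for real training; the file stays runnable as plain Python because every annotation is an ordinary string):

```python
def train_one_epoch(lr, keep_prob):
    # Stub standing in for a real training epoch; returns a fake accuracy.
    return 0.9 - lr + 0.01 * keep_prob

'''@nni.variable(nni.choice(0.1, 0.01, 0.001), name=learning_rate)'''
learning_rate = 0.1

'''@nni.variable(nni.uniform(0.5, 0.9), name=keep_prob)'''
keep_prob = 0.5

for epoch in range(10):
    test_acc = train_one_epoch(learning_rate, keep_prob)
    '''@nni.report_intermediate_result(test_acc)'''

'''@nni.report_final_result(test_acc)'''
```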
tools/nni_annotation/examples/mnist_generated.py (+17 −2)

As rendered in the diff view, `import nni` moves to the very top of the file (where the annotation tool injects it):

```diff
+import nni
 """A deep MNIST classifier using convolutional layers."""
 import logging
 import math
 import tempfile
 import time
 import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
-import nni
 
 FLAGS = None
 logger = logging.getLogger('mnist_AutoML')
@@ -123,12 +127,23 @@ def bias_variable(shape):
     initial = tf.constant(0.1, shape=shape)
     return tf.Variable(initial)
 
+def download_mnist_retry(data_dir, max_num_retries=20):
+    """Try to download mnist dataset and avoid errors"""
+    for _ in range(max_num_retries):
+        try:
+            return input_data.read_data_sets(data_dir, one_hot=True)
+        except tf.errors.AlreadyExistsError:
+            time.sleep(1)
+    raise Exception("Failed to download MNIST.")
+
 def main(params):
-    """
-    Main function, build mnist network, run and send result to NNI.
-    """
-    mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
+    # Import data
+    mnist = download_mnist_retry(params['data_dir'])
+    print('Mnist download data done.')
+    logger.debug('Mnist download data done.')
     mnist_network = MnistNetwork(channel_1_num=params['channel_1_num'],
```
tools/nni_annotation/examples/mnist_with_annotation.py (+11 −2)

Same retry-download change as the mnist trial examples:

```diff
@@ -21,8 +21,9 @@
 import logging
 import math
 import tempfile
-import tensorflow as tf
+import time
+import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
 
 FLAGS = None
@@ -168,13 +169,21 @@ def bias_variable(shape):
     initial = tf.constant(0.1, shape=shape)
     return tf.Variable(initial)
 
+def download_mnist_retry(data_dir, max_num_retries=20):
+    """Try to download mnist dataset and avoid errors"""
+    for _ in range(max_num_retries):
+        try:
+            return input_data.read_data_sets(data_dir, one_hot=True)
+        except tf.errors.AlreadyExistsError:
+            time.sleep(1)
+    raise Exception("Failed to download MNIST.")
+
 def main(params):
     '''
     Main function, build mnist network, run and send result to NNI.
     '''
     # Import data
-    mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
+    mnist = download_mnist_retry(params['data_dir'])
     print('Mnist download data done.')
     logger.debug('Mnist download data done.')
```
tools/nni_annotation/examples/mnist_without_annotation.py (+11 −2)

Same retry-download change once more:

```diff
@@ -21,8 +21,9 @@
 import logging
 import math
 import tempfile
-import tensorflow as tf
+import time
+import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
 import nni
@@ -172,13 +173,21 @@ def bias_variable(shape):
     initial = tf.constant(0.1, shape=shape)
     return tf.Variable(initial)
 
+def download_mnist_retry(data_dir, max_num_retries=20):
+    """Try to download mnist dataset and avoid errors"""
+    for _ in range(max_num_retries):
+        try:
+            return input_data.read_data_sets(data_dir, one_hot=True)
+        except tf.errors.AlreadyExistsError:
+            time.sleep(1)
+    raise Exception("Failed to download MNIST.")
+
 def main(params):
     '''
     Main function, build mnist network, run and send result to NNI.
     '''
     # Import data
-    mnist = input_data.read_data_sets(params['data_dir'], one_hot=True)
+    mnist = download_mnist_retry(params['data_dir'])
     print('Mnist download data done.')
     logger.debug('Mnist download data done.')
```