Unverified Commit 55f48d27 authored by QuanluZhang's avatar QuanluZhang Committed by GitHub
Browse files

Merge dev-nas-tuner back to master (#1531)

* PPO tuner for NAS, supports NNI's NAS interface (#1380)
parent 7246593f
import tensorflow as tf
from src.utils import DEFINE_boolean
from src.utils import DEFINE_float
from src.utils import DEFINE_integer
from src.utils import DEFINE_string
# tf.app.flags-based configuration. The DEFINE_* helpers (imported from
# src.utils) also record every flag name so print_user_flags can list them.
flags = tf.app.flags
FLAGS = flags.FLAGS

# --- experiment / data flags ---
DEFINE_boolean("reset_output_dir", False, "Delete output_dir if exists.")
DEFINE_string("data_path", "", "")
DEFINE_string("output_dir", "", "")
# BUGFIX: help text said 'NCWH'; the two valid formats are NHWC and NCHW.
DEFINE_string("data_format", "NHWC", "'NHWC' or 'NCHW'")
DEFINE_string("search_for", None, "Must be [macro|micro]")

# --- child network training flags ---
DEFINE_integer("train_data_size", 45000, "")
DEFINE_integer("batch_size", 32, "")
DEFINE_integer("num_epochs", 300, "")
DEFINE_integer("child_lr_dec_every", 100, "")
DEFINE_integer("child_num_layers", 5, "")
DEFINE_integer("child_num_cells", 5, "")
DEFINE_integer("child_filter_size", 5, "")
DEFINE_integer("child_out_filters", 48, "")
DEFINE_integer("child_out_filters_scale", 1, "")
DEFINE_integer("child_num_branches", 4, "")
DEFINE_integer("child_num_aggregate", None, "")
DEFINE_integer("child_num_replicas", 1, "")
DEFINE_integer("child_block_size", 3, "")
DEFINE_integer("child_lr_T_0", None, "for lr schedule")
DEFINE_integer("child_lr_T_mul", None, "for lr schedule")
DEFINE_integer("child_cutout_size", None, "CutOut size")
DEFINE_float("child_grad_bound", 5.0, "Gradient clipping")
DEFINE_float("child_lr", 0.1, "")
DEFINE_float("child_lr_dec_rate", 0.1, "")
DEFINE_float("child_keep_prob", 0.5, "")
DEFINE_float("child_drop_path_keep_prob", 1.0, "minimum drop_path_keep_prob")
DEFINE_float("child_l2_reg", 1e-4, "")
DEFINE_float("child_lr_max", None, "for lr schedule")
DEFINE_float("child_lr_min", None, "for lr schedule")
DEFINE_string("child_skip_pattern", None, "Must be ['dense', None]")
DEFINE_string("child_fixed_arc", None, "")
DEFINE_boolean("child_use_aux_heads", False, "Should we use an aux head")
DEFINE_boolean("child_sync_replicas", False, "To sync or not to sync.")
DEFINE_boolean("child_lr_cosine", False, "Use cosine lr schedule")

# --- logging flags ---
DEFINE_integer("log_every", 50, "How many steps to log")
DEFINE_integer("eval_every_epochs", 1, "How many epochs to eval")
import numpy as np
import tensorflow as tf
from tensorflow.python.training import moving_averages
def lstm(x, prev_c, prev_h, w):
    """One LSTM step: combine input x with state (prev_c, prev_h) using weight w."""
    gates = tf.matmul(tf.concat([x, prev_h], axis=1), w)
    in_gate, forget_gate, out_gate, cell_in = tf.split(gates, 4, axis=1)
    in_gate = tf.sigmoid(in_gate)
    forget_gate = tf.sigmoid(forget_gate)
    out_gate = tf.sigmoid(out_gate)
    cell_in = tf.tanh(cell_in)
    next_c = in_gate * cell_in + forget_gate * prev_c
    next_h = out_gate * tf.tanh(next_c)
    return next_c, next_h
def stack_lstm(x, prev_c, prev_h, w):
    """Run a stack of LSTM layers: layer 0 consumes x, later layers consume
    the hidden state of the layer below. Returns the new (cells, hiddens) lists."""
    next_c = []
    next_h = []
    for layer_id, (cell, hidden, weight) in enumerate(zip(prev_c, prev_h, w)):
        layer_input = x if layer_id == 0 else next_h[-1]
        new_c, new_h = lstm(layer_input, cell, hidden, weight)
        next_c.append(new_c)
        next_h.append(new_h)
    return next_c, next_h
def create_weight(name, shape, initializer=None, trainable=True, seed=None):
    """Create (or reuse) a weight variable; defaults to He-normal initialization."""
    init = (tf.contrib.keras.initializers.he_normal(seed=seed)
            if initializer is None else initializer)
    return tf.get_variable(name, shape, initializer=init, trainable=trainable)
def create_bias(name, shape, initializer=None):
    """Create (or reuse) a bias variable; defaults to zero initialization."""
    init = (tf.constant_initializer(0.0, dtype=tf.float32)
            if initializer is None else initializer)
    return tf.get_variable(name, shape, initializer=init)
def conv_op(inputs, filter_size, is_training, count, out_filters,
            data_format, ch_mul=1, start_idx=None, separable=False):
    """Convolution branch: 1x1 "stem" conv + BN + relu, then a (separable) conv.

    Args:
        inputs: input feature map, layout per `data_format`.
        filter_size: spatial size of the main conv kernel.
        is_training: python bool selecting train/eval batch-norm behavior.
        count: how many output_channels to take.
        out_filters: channel count produced by the 1x1 stem conv.
        data_format: "NHWC" or "NCHW".
        ch_mul: depth multiplier for the separable conv.
        start_idx: where to start taking the output channels. if None, assuming
            fixed_arc mode.
        separable: use a depthwise-separable conv instead of a plain conv.
    """
    if data_format == "NHWC":
        inp_c = inputs.get_shape()[3].value
    elif data_format == "NCHW":
        inp_c = inputs.get_shape()[1].value
    # 1x1 conv brings the input up to out_filters channels.
    with tf.variable_scope("inp_conv_1"):
        w = create_weight("w", [1, 1, inp_c, out_filters])
        x = tf.nn.conv2d(inputs, w, [1, 1, 1, 1],
                         "SAME", data_format=data_format)
        x = batch_norm(x, is_training, data_format=data_format)
        x = tf.nn.relu(x)
    with tf.variable_scope("out_conv_{}".format(filter_size)):
        if start_idx is None:
            # fixed-arc mode: produce exactly `count` channels directly.
            if separable:
                w_depth = create_weight(
                    "w_depth", [filter_size, filter_size, out_filters, ch_mul])
                w_point = create_weight(
                    "w_point", [1, 1, out_filters * ch_mul, count])
                x = tf.nn.separable_conv2d(x, w_depth, w_point, strides=[1, 1, 1, 1],
                                           padding="SAME", data_format=data_format)
                x = batch_norm(
                    x, is_training, data_format=data_format)
            else:
                # NOTE(review): x has out_filters channels at this point (after
                # inp_conv_1), yet this kernel is declared with inp_c input
                # channels -- it only works when inp_c == out_filters; confirm
                # upstream shapes.
                w = create_weight(
                    "w", [filter_size, filter_size, inp_c, count])
                x = tf.nn.conv2d(
                    x, w, [1, 1, 1, 1], "SAME", data_format=data_format)
                x = batch_norm(
                    x, is_training, data_format=data_format)
        else:
            # search mode: allocate full-size weights, then slice out the
            # [start_idx, start_idx+count) output channels so weights are shared
            # across architectures.
            if separable:
                w_depth = create_weight(
                    "w_depth", [filter_size, filter_size, out_filters, ch_mul])
                #test_depth = w_depth
                w_point = create_weight(
                    "w_point", [out_filters, out_filters * ch_mul])
                w_point = w_point[start_idx:start_idx+count, :]
                w_point = tf.transpose(w_point, [1, 0])
                w_point = tf.reshape(
                    w_point, [1, 1, out_filters * ch_mul, count])
                x = tf.nn.separable_conv2d(x, w_depth, w_point, strides=[1, 1, 1, 1],
                                           padding="SAME", data_format=data_format)
                # boolean channel mask so only the selected channels' BN
                # statistics are updated
                mask = tf.range(0, out_filters, dtype=tf.int32)
                mask = tf.logical_and(
                    start_idx <= mask, mask < start_idx + count)
                x = batch_norm_with_mask(
                    x, is_training, mask, out_filters, data_format=data_format)
            else:
                w = create_weight(
                    "w", [filter_size, filter_size, out_filters, out_filters])
                # slice the selected output channels out of the shared kernel
                w = tf.transpose(w, [3, 0, 1, 2])
                w = w[start_idx:start_idx+count, :, :, :]
                w = tf.transpose(w, [1, 2, 3, 0])
                x = tf.nn.conv2d(
                    x, w, [1, 1, 1, 1], "SAME", data_format=data_format)
                mask = tf.range(0, out_filters, dtype=tf.int32)
                mask = tf.logical_and(
                    start_idx <= mask, mask < start_idx + count)
                x = batch_norm_with_mask(
                    x, is_training, mask, out_filters, data_format=data_format)
    x = tf.nn.relu(x)
    return x
def pool_op(inputs, is_training, count, out_filters, avg_or_max, data_format, start_idx=None):
    """Pooling branch: 1x1 conv + BN + relu, then 3x3 stride-1 avg/max pooling.

    Args:
        avg_or_max: "avg" or "max", selecting the pooling type.
        start_idx: where to start taking the output channels. if None, assuming
            fixed_arc mode.
        count: how many output_channels to take.
    """
    if data_format == "NHWC":
        inp_c = inputs.get_shape()[3].value
    elif data_format == "NCHW":
        inp_c = inputs.get_shape()[1].value
    # 1x1 conv brings the input up to out_filters channels.
    with tf.variable_scope("conv_1"):
        w = create_weight("w", [1, 1, inp_c, out_filters])
        x = tf.nn.conv2d(inputs, w, [1, 1, 1, 1],
                         "SAME", data_format=data_format)
        x = batch_norm(x, is_training, data_format=data_format)
        x = tf.nn.relu(x)
    with tf.variable_scope("pool"):
        # tf.layers pooling uses "channels_last"/"channels_first" naming
        if data_format == "NHWC":
            actual_data_format = "channels_last"
        elif data_format == "NCHW":
            actual_data_format = "channels_first"
        if avg_or_max == "avg":
            x = tf.layers.average_pooling2d(
                x, [3, 3], [1, 1], "SAME", data_format=actual_data_format)
        elif avg_or_max == "max":
            x = tf.layers.max_pooling2d(
                x, [3, 3], [1, 1], "SAME", data_format=actual_data_format)
        else:
            raise ValueError("Unknown pool {}".format(avg_or_max))
    if start_idx is not None:
        # search mode: keep only the selected slice of channels
        if data_format == "NHWC":
            x = x[:, :, :, start_idx: start_idx+count]
        elif data_format == "NCHW":
            x = x[:, start_idx: start_idx+count, :, :]
    return x
def global_avg_pool(x, data_format="NHWC"):
    """Average over the two spatial dimensions, leaving a [N, C] tensor."""
    if data_format == "NHWC":
        spatial_axes = [1, 2]
    elif data_format == "NCHW":
        spatial_axes = [2, 3]
    else:
        raise NotImplementedError("Unknown data_format {}".format(data_format))
    return tf.reduce_mean(x, spatial_axes)
def batch_norm(x, is_training, name="bn", decay=0.9, epsilon=1e-5,
               data_format="NHWC"):
    """Batch normalization with manually-managed moving statistics.

    In training mode, batch statistics are used and the moving mean/variance
    variables are updated with an exponential moving average (rate `decay`).
    In eval mode, the stored moving statistics are used and variables are
    reused (reuse=True) so both graphs share parameters.
    """
    if data_format == "NHWC":
        shape = [x.get_shape()[3]]
    elif data_format == "NCHW":
        shape = [x.get_shape()[1]]
    else:
        raise NotImplementedError("Unknown data_format {}".format(data_format))
    with tf.variable_scope(name, reuse=None if is_training else True):
        offset = tf.get_variable(
            "offset", shape,
            initializer=tf.constant_initializer(0.0, dtype=tf.float32))
        scale = tf.get_variable(
            "scale", shape,
            initializer=tf.constant_initializer(1.0, dtype=tf.float32))
        # moving statistics are updated manually below, hence not trainable
        moving_mean = tf.get_variable(
            "moving_mean", shape, trainable=False,
            initializer=tf.constant_initializer(0.0, dtype=tf.float32))
        moving_variance = tf.get_variable(
            "moving_variance", shape, trainable=False,
            initializer=tf.constant_initializer(1.0, dtype=tf.float32))
        if is_training:
            x, mean, variance = tf.nn.fused_batch_norm(
                x, scale, offset, epsilon=epsilon, data_format=data_format,
                is_training=True)
            update_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay)
            update_variance = moving_averages.assign_moving_average(
                moving_variance, variance, decay)
            # tie the updates to the output so they run whenever x is consumed
            with tf.control_dependencies([update_mean, update_variance]):
                x = tf.identity(x)
        else:
            x, _, _ = tf.nn.fused_batch_norm(x, scale, offset, mean=moving_mean,
                                             variance=moving_variance,
                                             epsilon=epsilon, data_format=data_format,
                                             is_training=False)
    return x
def batch_norm_with_mask(x, is_training, mask, num_channels, name="bn",
                         decay=0.9, epsilon=1e-3, data_format="NHWC"):
    """Batch normalization over a masked subset of channels.

    Only the channels selected by the boolean `mask` (length `num_channels`)
    participate: offset/scale are sliced with the mask, and in training mode
    only the masked entries of the moving statistics are updated (scatter_sub
    at the masked indices).
    """
    shape = [num_channels]
    # integer indices of the channels selected by the boolean mask
    indices = tf.where(mask)
    indices = tf.to_int32(indices)
    indices = tf.reshape(indices, [-1])
    with tf.variable_scope(name, reuse=None if is_training else True):
        offset = tf.get_variable(
            "offset", shape,
            initializer=tf.constant_initializer(0.0, dtype=tf.float32))
        scale = tf.get_variable(
            "scale", shape,
            initializer=tf.constant_initializer(1.0, dtype=tf.float32))
        # slice parameters down to the active channels
        offset = tf.boolean_mask(offset, mask)
        scale = tf.boolean_mask(scale, mask)
        moving_mean = tf.get_variable(
            "moving_mean", shape, trainable=False,
            initializer=tf.constant_initializer(0.0, dtype=tf.float32))
        moving_variance = tf.get_variable(
            "moving_variance", shape, trainable=False,
            initializer=tf.constant_initializer(1.0, dtype=tf.float32))
        if is_training:
            x, mean, variance = tf.nn.fused_batch_norm(
                x, scale, offset, epsilon=epsilon, data_format=data_format,
                is_training=True)
            # moving_stat -= (1 - decay) * (moving_stat - batch_stat),
            # applied only at the masked indices via scatter_sub
            mean = (1.0 - decay) * (tf.boolean_mask(moving_mean, mask) - mean)
            variance = (1.0 - decay) * \
                (tf.boolean_mask(moving_variance, mask) - variance)
            update_mean = tf.scatter_sub(
                moving_mean, indices, mean, use_locking=True)
            update_variance = tf.scatter_sub(
                moving_variance, indices, variance, use_locking=True)
            # tie the updates to the output so they run whenever x is consumed
            with tf.control_dependencies([update_mean, update_variance]):
                x = tf.identity(x)
        else:
            masked_moving_mean = tf.boolean_mask(moving_mean, mask)
            masked_moving_variance = tf.boolean_mask(moving_variance, mask)
            x, _, _ = tf.nn.fused_batch_norm(x, scale, offset,
                                             mean=masked_moving_mean,
                                             variance=masked_moving_variance,
                                             epsilon=epsilon, data_format=data_format,
                                             is_training=False)
    return x
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import numpy as np
import tensorflow as tf
user_flags = []
def DEFINE_string(name, default_value, doc_string):
    """Declare a string flag and record its name for print_user_flags."""
    tf.app.flags.DEFINE_string(name, default_value, doc_string)
    user_flags.append(name)
def DEFINE_integer(name, default_value, doc_string):
    """Declare an integer flag and record its name for print_user_flags."""
    tf.app.flags.DEFINE_integer(name, default_value, doc_string)
    user_flags.append(name)
def DEFINE_float(name, default_value, doc_string):
    """Declare a float flag and record its name for print_user_flags."""
    tf.app.flags.DEFINE_float(name, default_value, doc_string)
    user_flags.append(name)
def DEFINE_boolean(name, default_value, doc_string):
    """Declare a boolean flag and record its name for print_user_flags."""
    tf.app.flags.DEFINE_boolean(name, default_value, doc_string)
    user_flags.append(name)
def print_user_flags(line_limit=80):
    """Print every registered flag as 'name....value', dot-padded to line_limit."""
    print("-" * 80)
    FLAGS = tf.app.flags.FLAGS
    for flag_name in sorted(user_flags):
        value = str(getattr(FLAGS, flag_name))
        # '.' * negative yields '' when name+value exceed the limit
        padding = "." * (line_limit - len(flag_name) - len(value))
        print(flag_name + padding + value)
def get_C(x, data_format):
    """Return the channel count of x.

    Args:
        x: tensor of shape [N, H, W, C] or [N, C, H, W]
        data_format: "NHWC" or "NCHW"
    """
    channel_axis = {"NHWC": 3, "NCHW": 1}.get(data_format)
    if channel_axis is None:
        raise ValueError(
            "Unknown data_format '{0}'".format(data_format))
    return x.get_shape()[channel_axis].value
def get_HW(x, data_format):
    """Return the spatial size of x.

    Args:
        x: tensor of shape [N, H, W, C] or [N, C, H, W]

    NOTE(review): `data_format` is ignored and dim 2 is returned regardless --
    that is W for NHWC but H for NCHW. This presumably assumes square feature
    maps (H == W); confirm against callers.
    """
    return x.get_shape()[2].value
def get_strides(stride, data_format):
    """Build the 4-element strides vector for tf conv/pool ops.

    Args:
        stride: spatial stride, applied to both H and W.
        data_format: "NHWC" or "NCHW".
    """
    if data_format == "NHWC":
        return [1, stride, stride, 1]
    if data_format == "NCHW":
        return [1, 1, stride, stride]
    raise ValueError(
        "Unknown data_format '{0}'".format(data_format))
class TextColors:
    """ANSI escape codes for colorized / styled terminal output."""
    HEADER = '\033[95m'     # bright magenta
    OKBLUE = '\033[94m'     # bright blue
    OKGREEN = '\033[92m'    # bright green
    WARNING = '\033[93m'    # bright yellow
    FAIL = '\033[91m'       # bright red
    ENDC = '\033[0m'        # reset all attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
class Logger(object):
    """Tee-style writer: echoes every message to stdout and appends it to a file.

    Intended to replace sys.stdout so all prints are also logged.
    """
    def __init__(self, output_file):
        # keep a handle to the real stdout so messages still reach the terminal
        self.terminal = sys.stdout
        # NOTE: file is opened in append mode and never closed explicitly
        self.log = open(output_file, "a")
    def write(self, message):
        # flush both targets immediately so output is not lost on crash
        self.terminal.write(message)
        self.terminal.flush()
        self.log.write(message)
        self.log.flush()
def count_model_params(tf_variables):
    """Total number of scalar parameters across the given variables.

    Args:
        tf_variables: list of all model variables
    """
    return sum(np.prod([dim.value for dim in var.get_shape()])
               for var in tf_variables)
def get_train_ops(
        loss,
        tf_variables,
        train_step,
        clip_mode=None,
        grad_bound=None,
        l2_reg=1e-4,
        lr_warmup_val=None,
        lr_warmup_steps=100,
        lr_init=0.1,
        lr_dec_start=0,
        lr_dec_every=10000,
        lr_dec_rate=0.1,
        lr_dec_min=None,
        lr_cosine=False,
        lr_max=None,
        lr_min=None,
        lr_T_0=None,
        lr_T_mul=None,
        num_train_batches=None,
        optim_algo=None,
        sync_replicas=False,
        num_aggregate=None,
        num_replicas=None,
        get_grad_norms=False,
        moving_average=None):
    """Build the training op: L2 regularization, gradient clipping, the
    learning-rate schedule and the optimizer.

    Args:
        clip_mode: "global", "norm", or None.
        moving_average: store the moving average of parameters
    Returns:
        (train_op, learning_rate, grad_norm, opt) and, if get_grad_norms,
        also a {variable_name: gradient_norm} dict.
    """
    # add L2 regularization on all trainable variables to the loss
    if l2_reg > 0:
        l2_losses = []
        for var in tf_variables:
            l2_losses.append(tf.reduce_sum(var ** 2))
        l2_loss = tf.add_n(l2_losses)
        loss += l2_reg * l2_loss

    grads = tf.gradients(loss, tf_variables)
    grad_norm = tf.global_norm(grads)

    # per-variable gradient norms, for monitoring
    grad_norms = {}
    for v, g in zip(tf_variables, grads):
        if v is None or g is None:
            continue
        if isinstance(g, tf.IndexedSlices):
            grad_norms[v.name] = tf.sqrt(tf.reduce_sum(g.values ** 2))
        else:
            grad_norms[v.name] = tf.sqrt(tf.reduce_sum(g ** 2))

    if clip_mode is not None:
        assert grad_bound is not None, "Need grad_bound to clip gradients."
        if clip_mode == "global":
            grads, _ = tf.clip_by_global_norm(grads, grad_bound)
        elif clip_mode == "norm":
            clipped = []
            for g in grads:
                if isinstance(g, tf.IndexedSlices):
                    c_g = tf.clip_by_norm(g.values, grad_bound)
                    # BUGFIX: tf.IndexedSlices takes (values, indices); the
                    # original passed them swapped.
                    c_g = tf.IndexedSlices(c_g, g.indices)
                else:
                    c_g = tf.clip_by_norm(g, grad_bound)
                # BUGFIX: the original appended the unclipped gradient `g`,
                # so "norm" clipping silently had no effect.
                clipped.append(c_g)
            grads = clipped
        else:
            raise NotImplementedError("Unknown clip_mode {}".format(clip_mode))

    if lr_cosine:
        # cosine learning-rate schedule with warm restarts: each period lasts
        # T_i epochs and is stretched by lr_T_mul after every restart
        assert lr_max is not None, "Need lr_max to use lr_cosine"
        assert lr_min is not None, "Need lr_min to use lr_cosine"
        assert lr_T_0 is not None, "Need lr_T_0 to use lr_cosine"
        assert lr_T_mul is not None, "Need lr_T_mul to use lr_cosine"
        assert num_train_batches is not None, ("Need num_train_batches to use"
                                               " lr_cosine")

        curr_epoch = train_step // num_train_batches
        last_reset = tf.Variable(0, dtype=tf.int32, trainable=False,
                                 name="last_reset")
        T_i = tf.Variable(lr_T_0, dtype=tf.int32, trainable=False, name="T_i")
        T_curr = curr_epoch - last_reset

        def _update():
            # restart: record the epoch and stretch the next period by lr_T_mul
            update_last_reset = tf.assign(
                last_reset, curr_epoch, use_locking=True)
            update_T_i = tf.assign(T_i, T_i * lr_T_mul, use_locking=True)
            with tf.control_dependencies([update_last_reset, update_T_i]):
                rate = tf.to_float(T_curr) / tf.to_float(T_i) * 3.1415926
                lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + tf.cos(rate))
            return lr

        def _no_update():
            rate = tf.to_float(T_curr) / tf.to_float(T_i) * 3.1415926
            lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + tf.cos(rate))
            return lr

        learning_rate = tf.cond(
            tf.greater_equal(T_curr, T_i), _update, _no_update)
    else:
        # staircase exponential decay, optionally floored at lr_dec_min
        learning_rate = tf.train.exponential_decay(
            lr_init, tf.maximum(train_step - lr_dec_start, 0), lr_dec_every,
            lr_dec_rate, staircase=True)
        if lr_dec_min is not None:
            learning_rate = tf.maximum(learning_rate, lr_dec_min)

    if lr_warmup_val is not None:
        # constant warmup learning rate for the first lr_warmup_steps steps
        learning_rate = tf.cond(tf.less(train_step, lr_warmup_steps),
                                lambda: lr_warmup_val, lambda: learning_rate)

    if optim_algo == "momentum":
        opt = tf.train.MomentumOptimizer(
            learning_rate, 0.9, use_locking=True, use_nesterov=True)
    elif optim_algo == "sgd":
        opt = tf.train.GradientDescentOptimizer(
            learning_rate, use_locking=True)
    elif optim_algo == "adam":
        opt = tf.train.AdamOptimizer(learning_rate, beta1=0.0, epsilon=1e-3,
                                     use_locking=True)
    else:
        raise ValueError("Unknown optim_algo {}".format(optim_algo))

    if sync_replicas:
        assert num_aggregate is not None, "Need num_aggregate to sync."
        assert num_replicas is not None, "Need num_replicas to sync."
        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=num_aggregate,
            total_num_replicas=num_replicas,
            use_locking=True)

    if moving_average is not None:
        opt = tf.contrib.opt.MovingAverageOptimizer(
            opt, average_decay=moving_average)

    train_op = opt.apply_gradients(
        zip(grads, tf_variables), global_step=train_step)

    if get_grad_norms:
        return train_op, learning_rate, grad_norm, opt, grad_norms
    return train_op, learning_rate, grad_norm, opt
......@@ -2,13 +2,14 @@ import numpy as np
from nni.tuner import Tuner
def random_archi_generator(nas_ss, random_state):
'''random
'''
chosen_archi = {}
print("zql: nas search space: ", nas_ss)
for block_name, block_value in nas_ss.items():
assert block_value['_type'] == "mutable_layer", "Random NAS Tuner only receives NAS search space whose _type is 'mutable_layer'"
assert block_value['_type'] == "mutable_layer", \
"Random NAS Tuner only receives NAS search space whose _type is 'mutable_layer'"
block = block_value['_value']
tmp_block = {}
for layer_name, layer in block.items():
......@@ -19,13 +20,12 @@ def random_archi_generator(nas_ss, random_state):
tmp_layer['chosen_layer'] = value[index]
elif key == 'optional_inputs':
tmp_layer['chosen_inputs'] = []
print("zql: optional_inputs", layer['optional_inputs'])
if layer['optional_inputs']:
if isinstance(layer['optional_input_size'], int):
choice_num = layer['optional_input_size']
else:
choice_range = layer['optional_input_size']
choice_num = random_state.randint(choice_range[0], choice_range[1]+1)
choice_num = random_state.randint(choice_range[0], choice_range[1] + 1)
for _ in range(choice_num):
index = random_state.randint(len(layer['optional_inputs']))
tmp_layer['chosen_inputs'].append(layer['optional_inputs'][index])
......@@ -37,6 +37,7 @@ def random_archi_generator(nas_ss, random_state):
chosen_archi[block_name] = tmp_block
return chosen_archi
class RandomNASTuner(Tuner):
'''RandomNASTuner
'''
......
......@@ -174,7 +174,7 @@ export namespace ValidationSchemas {
checkpointDir: joi.string().allow('')
}),
tuner: joi.object({
builtinTunerName: joi.string().valid('TPE', 'Random', 'Anneal', 'Evolution', 'SMAC', 'BatchTuner', 'GridSearch', 'NetworkMorphism', 'MetisTuner', 'GPTuner'),
builtinTunerName: joi.string().valid('TPE', 'Random', 'Anneal', 'Evolution', 'SMAC', 'BatchTuner', 'GridSearch', 'NetworkMorphism', 'MetisTuner', 'GPTuner', 'PPOTuner'),
codeDir: joi.string(),
classFileName: joi.string(),
className: joi.string(),
......
......@@ -30,7 +30,8 @@ ModuleName = {
'NetworkMorphism': 'nni.networkmorphism_tuner.networkmorphism_tuner',
'Curvefitting': 'nni.curvefitting_assessor.curvefitting_assessor',
'MetisTuner': 'nni.metis_tuner.metis_tuner',
'GPTuner': 'nni.gp_tuner.gp_tuner'
'GPTuner': 'nni.gp_tuner.gp_tuner',
'PPOTuner': 'nni.ppo_tuner.ppo_tuner'
}
ClassName = {
......@@ -44,6 +45,7 @@ ClassName = {
'NetworkMorphism':'NetworkMorphismTuner',
'MetisTuner':'MetisTuner',
'GPTuner':'GPTuner',
'PPOTuner': 'PPOTuner',
'Medianstop': 'MedianstopAssessor',
'Curvefitting': 'CurvefittingAssessor'
......
......@@ -27,6 +27,7 @@ import logging
import hyperopt as hp
import numpy as np
from nni.tuner import Tuner
from nni.nas_utils import rewrite_nas_space
from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, split_index
logger = logging.getLogger('hyperopt_AutoML')
......@@ -240,6 +241,7 @@ class HyperoptTuner(Tuner):
return hp.anneal.suggest
raise RuntimeError('Not support tuner algorithm in hyperopt.')
@rewrite_nas_space
def update_search_space(self, search_space):
"""
Update search space definition in tuner by search_space in parameters.
......
......@@ -101,11 +101,16 @@ class MsgDispatcher(MsgDispatcherBase):
self.tuner.update_search_space(data)
send(CommandType.Initialized, '')
def send_trial_callback(self, id, params):
"""For tuner to issue trial config when the config is generated
"""
send(CommandType.NewTrialJob, _pack_parameter(id, params))
def handle_request_trial_jobs(self, data):
# data: number or trial jobs
ids = [_create_parameter_id() for _ in range(data)]
_logger.debug("requesting for generating params of {}".format(ids))
params_list = self.tuner.generate_multiple_parameters(ids)
params_list = self.tuner.generate_multiple_parameters(ids, st_callback=self.send_trial_callback)
for i, _ in enumerate(params_list):
send(CommandType.NewTrialJob, _pack_parameter(ids[i], params_list[i]))
......
......@@ -17,10 +17,16 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
import functools
import logging
from . import trial
_logger = logging.getLogger(__name__)
_MUTABLE_LAYER_SPACE_PREFIX = "_mutable_layer"
def classic_mode(
mutable_id,
mutable_layer_id,
......@@ -34,13 +40,11 @@ def classic_mode(
without touching the full model graph.'''
if trial.get_current_parameter() is None:
trial.get_next_parameter()
mutable_block = trial.get_current_parameter(mutable_id)
chosen_layer = mutable_block[mutable_layer_id]["chosen_layer"]
chosen_inputs = mutable_block[mutable_layer_id]["chosen_inputs"]
real_chosen_inputs = [optional_inputs[input_name]
for input_name in chosen_inputs]
layer_out = funcs[chosen_layer](
[fixed_inputs, real_chosen_inputs], **funcs_args[chosen_layer])
chosen_layer, chosen_inputs = _get_layer_and_inputs_from_tuner(mutable_id, mutable_layer_id,
list(optional_inputs.keys()))
real_chosen_inputs = [optional_inputs[input_name] for input_name in chosen_inputs]
layer_out = funcs[chosen_layer]([fixed_inputs, real_chosen_inputs], **funcs_args[chosen_layer])
return layer_out
......@@ -173,20 +177,44 @@ def reload_tensorflow_variables(tf, session):
tf: tensorflow module
'''
subgraph_from_tuner = trial.get_next_parameter()
for mutable_id, mutable_block in subgraph_from_tuner.items():
mutable_layers = set()
for subgraph_key in subgraph_from_tuner:
if "/" in subgraph_key:
# has to remove the last, could be layer_choice or whatever
mutable_id, mutable_layer_id = _decompose_general_key(subgraph_key[:subgraph_key.rfind("/")])
if mutable_id is not None:
mutable_layers.add((mutable_id, mutable_layer_id))
mutable_layers = sorted(list(mutable_layers))
for mutable_id, mutable_layer_id in mutable_layers:
if mutable_id not in name_space:
_logger.warning("{} not found in name space".format(mutable_id))
continue
for mutable_layer_id, mutable_layer in mutable_block.items():
name_prefix = "{}_{}".format(mutable_id, mutable_layer_id)
# extract layer information from the subgraph sampled by tuner
chosen_layer = name_space[name_prefix]['funcs'].index(
mutable_layer["chosen_layer"])
chosen_inputs = [1 if inp in mutable_layer["chosen_inputs"]
else 0 for inp in name_space[name_prefix]['optional_inputs']]
# load these information into pre-defined tensorflow variables
tf_variables[name_prefix]['funcs'].load(chosen_layer, session)
tf_variables[name_prefix]['optional_inputs'].load(
chosen_inputs, session)
name_prefix = "{}_{}".format(mutable_id, mutable_layer_id)
# get optional inputs names
optional_inputs = name_space[name_prefix]['optional_inputs']
# extract layer information from the subgraph sampled by tuner
chosen_layer, chosen_inputs = _get_layer_and_inputs_from_tuner(mutable_id, mutable_layer_id, optional_inputs)
chosen_layer = name_space[name_prefix]['funcs'].index(chosen_layer)
chosen_inputs = [1 if inp in chosen_inputs else 0 for inp in optional_inputs]
# load these information into pre-defined tensorflow variables
tf_variables[name_prefix]['funcs'].load(chosen_layer, session)
tf_variables[name_prefix]['optional_inputs'].load(
chosen_inputs, session)
def _construct_general_key(mutable_id, mutable_layer_id):
    """Mutable-layer key in the general (search space) format,
    i.e. prefix/mutable_id/mutable_layer_id."""
    return "/".join([_MUTABLE_LAYER_SPACE_PREFIX, mutable_id, mutable_layer_id])
def _decompose_general_key(key):
    """Inverse of _construct_general_key.

    Returns (mutable_id, mutable_layer_id), or (None, None) when the key is
    not in the general mutable-layer format.
    """
    if not key.startswith(_MUTABLE_LAYER_SPACE_PREFIX):
        return None, None
    _, mutable_id, mutable_layer_id = key.split("/", maxsplit=2)
    return mutable_id, mutable_layer_id
def darts_training(tf, session, loss, feed_dict):
......@@ -205,4 +233,107 @@ def training_update(nas_mode, tf=None, session=None, loss=None, feed_dict=None):
if nas_mode == 'darts_mode':
darts_training(tf, session, loss, feed_dict)
elif nas_mode == 'enas_mode':
reload_tensorflow_variables(tf, session)
\ No newline at end of file
reload_tensorflow_variables(tf, session)
def _get_layer_and_inputs_from_tuner(mutable_id, mutable_layer_id, optional_inputs):
    """Resolve the tuner's decision (chosen layer + chosen inputs) for one
    mutable layer.

    Args:
        mutable_id: id of the mutable block.
        mutable_layer_id: id of the layer inside the block.
        optional_inputs: name(key)s of the optional inputs.
    Returns:
        (chosen_layer, chosen_inputs) tuple.
    """
    try:
        mutable_block = trial.get_current_parameter(mutable_id)
        # There is a NAS tuner: parameters already come in NAS format
        chosen_layer = mutable_block[mutable_layer_id]["chosen_layer"]
        chosen_inputs = mutable_block[mutable_layer_id]["chosen_inputs"]
    except KeyError:
        # Try to find converted NAS parameters (produced by a regular HPO
        # tuner from the space built in convert_nas_search_space)
        params = trial.get_current_parameter()
        expected_prefix = _construct_general_key(mutable_id, mutable_layer_id)
        chosen_layer = params[expected_prefix + "/layer_choice"]
        # find how many to choose
        optional_input_size = int(params[expected_prefix + "/optional_input_size"])  # convert uniform to randint
        # find who to choose, can duplicate
        optional_input_state = params[expected_prefix + "/optional_input_chosen_state"]
        chosen_inputs = []
        # make sure dict -> list produce stable result by sorting
        optional_inputs_keys = sorted(optional_inputs)
        # decode the chosen-state integer as base-len(optional_inputs) digits,
        # one digit per chosen input (duplicates possible)
        for i in range(optional_input_size):
            chosen_inputs.append(optional_inputs_keys[optional_input_state % len(optional_inputs)])
            optional_input_state //= len(optional_inputs)
    _logger.info("%s_%s: layer: %s, optional inputs: %s" % (mutable_id, mutable_layer_id,
                                                            chosen_layer, chosen_inputs))
    return chosen_layer, chosen_inputs
def convert_nas_search_space(search_space):
    """Expand NAS `mutable_layer` entries of a raw search space into plain HPO
    primitives (choice/randint) so regular tuners can handle them.

    Args:
        search_space: raw search space (dict); values whose "_type" is
            "mutable_layer" are converted, everything else is passed through.
    Returns:
        A new search space dict in which every mutable layer becomes three
        entries: ".../layer_choice", ".../optional_input_size" and
        ".../optional_input_chosen_state".
    """
    ret = dict()
    for k, v in search_space.items():
        if "_type" not in v:
            # this should not happen
            _logger.warning("There is no _type in one of your search space values with key '%s'"
                            ". Please check your search space" % k)
            ret[k] = v
        elif v["_type"] != "mutable_layer":
            # ordinary HPO entry: pass through untouched
            ret[k] = v
        else:
            _logger.info("Converting mutable_layer search space with key '%s'" % k)
            # v["_value"] looks like {'mutable_layer_1': {'layer_choice': ...} ...}
            values = v["_value"]
            for layer_name, layer_data in values.items():
                # there should be at most layer_choice, optional_inputs,
                # optional_input_size in layer_data; add the mutable-layer
                # prefix so the keys can be recovered later
                layer_key = _construct_general_key(k, layer_name)

                if layer_data.get("layer_choice"):  # filter out empty choice and no choice
                    layer_choice = layer_data["layer_choice"]
                else:
                    raise ValueError("No layer choice found in %s" % layer_key)

                if layer_data.get("optional_input_size"):
                    input_size = layer_data["optional_input_size"]
                    if isinstance(input_size, int):
                        input_size = [input_size, input_size]
                    else:
                        # BUGFIX: copy before mutating -- the original modified
                        # the caller's search_space in place, so converting the
                        # same space twice kept widening the randint range.
                        input_size = list(input_size)
                    if input_size[0] > input_size[1] or input_size[0] < 0:
                        _logger.error("Might not be able to handle optional_input_size < 0, please double check")
                    # randint upper bound is exclusive
                    input_size[1] += 1
                else:
                    _logger.info("Optional input choices are set to empty by default in %s" % layer_key)
                    input_size = [0, 1]

                if layer_data.get("optional_inputs"):
                    # enough states to encode any (max-size) multiset of inputs
                    total_state_size = len(layer_data["optional_inputs"]) ** (input_size[1] - 1)
                else:
                    _logger.info("Optional inputs not found in %s" % layer_key)
                    total_state_size = 1

                converted = {
                    layer_key + "/layer_choice": {
                        "_type": "choice", "_value": layer_choice
                    },
                    layer_key + "/optional_input_size": {
                        "_type": "randint", "_value": input_size
                    },
                    layer_key + "/optional_input_chosen_state": {
                        "_type": "randint", "_value": [0, total_state_size]
                    }
                }
                _logger.info(converted)
                ret.update(converted)
    return ret
def rewrite_nas_space(func):
    """Decorator: convert NAS mutable_layer entries of the incoming search
    space into plain HPO primitives before invoking the wrapped method."""
    @functools.wraps(func)
    def wrap(self, search_space):
        return func(self, convert_nas_search_space(search_space))
    return wrap
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
functions for sampling from hidden state
"""
import tensorflow as tf
from .util import fc
class Pd:
    """
    A particular probability distribution.

    Abstract interface: subclasses implement flatparam / mode / neglogp /
    kl / entropy / sample; logp and the shape helpers are derived from them.
    """
    def flatparam(self):
        # flat tensor of the distribution's parameters
        raise NotImplementedError
    def mode(self):
        # most likely value
        raise NotImplementedError
    def neglogp(self, x):
        # Usually it's easier to define the negative logprob
        raise NotImplementedError
    def kl(self, other):
        # KL divergence to another distribution of the same family
        raise NotImplementedError
    def entropy(self):
        raise NotImplementedError
    def sample(self):
        raise NotImplementedError
    def logp(self, x):
        # log-probability, derived from the subclass's neglogp
        return - self.neglogp(x)
    def get_shape(self):
        return self.flatparam().shape
    @property
    def shape(self):
        return self.get_shape()
    def __getitem__(self, idx):
        # slice the distribution by slicing its flat parameters
        return self.__class__(self.flatparam()[idx])
class PdType:
    """
    Parametrized family of probability distributions.

    Subclasses define the concrete Pd class plus its parameter/sample shapes
    and dtypes; this base provides the generic construction helpers.
    """
    def pdclass(self):
        # concrete Pd subclass produced by this family
        raise NotImplementedError
    def pdfromflat(self, flat, mask, nsteps, size, is_act_model):
        # build a distribution directly from flat parameters
        return self.pdclass()(flat, mask, nsteps, size, is_act_model)
    def pdfromlatent(self, latent_vector, init_scale, init_bias):
        # build a distribution from a latent vector (adds its own head)
        raise NotImplementedError
    def param_shape(self):
        raise NotImplementedError
    def sample_shape(self):
        raise NotImplementedError
    def sample_dtype(self):
        raise NotImplementedError
    def param_placeholder(self, prepend_shape, name=None):
        # placeholder for distribution parameters with a leading batch shape
        return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name)
    def sample_placeholder(self, prepend_shape, name=None):
        # placeholder for samples with a leading batch shape
        return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name)
class CategoricalPd(Pd):
    """
    Categorical probability distribution over `size` discrete choices,
    parametrized by logits.
    """
    def __init__(self, logits, mask_npinf, nsteps, size, is_act_model):
        # logits: unnormalized log-probabilities
        # mask_npinf: additive mask applied to logits in sample() (name
        #   suggests -inf entries to forbid choices -- confirm at call site)
        # nsteps / size: used to reshape flattened logits when masking
        # is_act_model: when True, sample() skips the masking path
        self.logits = logits
        self.mask_npinf = mask_npinf
        self.nsteps = nsteps
        self.size = size
        self.is_act_model = is_act_model
    def flatparam(self):
        return self.logits
    def mode(self):
        # most likely choice
        return tf.argmax(self.logits, axis=-1)
    @property
    def mean(self):
        return tf.nn.softmax(self.logits)
    def neglogp(self, x):
        """
        Negative log-probability of x.

        Equivalent to
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x),
        but we can't use sparse_softmax_cross_entropy_with_logits because
        the implementation does not allow second-order derivatives, so x is
        one-hot encoded and the dense version is used instead.
        """
        if x.dtype in {tf.uint8, tf.int32, tf.int64}:
            # one-hot encoding
            x_shape_list = x.shape.as_list()
            logits_shape_list = self.logits.get_shape().as_list()[:-1]
            for xs, ls in zip(x_shape_list, logits_shape_list):
                if xs is not None and ls is not None:
                    assert xs == ls, 'shape mismatch: {} in x vs {} in logits'.format(xs, ls)
            x = tf.one_hot(x, self.logits.get_shape().as_list()[-1])
        else:
            # already encoded
            assert x.shape.as_list() == self.logits.shape.as_list()
        return tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.logits,
            labels=x)
    def kl(self, other):
        """KL divergence KL(self || other), in a numerically stable form
        (logits shifted by their max before exponentiation)."""
        a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True)
        a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keepdims=True)
        ea0 = tf.exp(a0)
        ea1 = tf.exp(a1)
        z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True)
        z1 = tf.reduce_sum(ea1, axis=-1, keepdims=True)
        p0 = ea0 / z0
        return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1)
    def entropy(self):
        """Entropy of the distribution (same stable shifted-logits form)."""
        a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True)
        ea0 = tf.exp(a0)
        z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True)
        p0 = ea0 / z0
        return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1)
    def sample(self):
        """Sample a choice via the Gumbel-max trick on the (masked) logits."""
        if not self.is_act_model:
            # reshape to [-1, nsteps, size], add the mask, then flatten back
            re_res = tf.reshape(self.logits, [-1, self.nsteps, self.size])
            masked_res = tf.math.add(re_res, self.mask_npinf)
            re_masked_res = tf.reshape(masked_res, [-1, self.size])
            u = tf.random_uniform(tf.shape(re_masked_res), dtype=self.logits.dtype)
            return tf.argmax(re_masked_res - tf.log(-tf.log(u)), axis=-1)
        else:
            u = tf.random_uniform(tf.shape(self.logits), dtype=self.logits.dtype)
            return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
    @classmethod
    def fromflat(cls, flat):
        # NOTE(review): cls(flat) does not match __init__'s five required
        # parameters; calling fromflat as-is would raise TypeError -- confirm
        # whether this method is ever used.
        return cls(flat)
class CategoricalPdType(PdType):
    """
    Factory for ``CategoricalPd`` distributions.
    """
    def __init__(self, ncat, nsteps, np_mask, is_act_model):
        self.ncat = ncat
        self.nsteps = nsteps
        self.np_mask = np_mask
        self.is_act_model = is_act_model

    def pdclass(self):
        return CategoricalPd

    def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
        """Project the latent vector through a masked fc layer and build the pd."""
        pdparam, mask, mask_npinf = _matching_fc(
            latent_vector, 'pi', self.ncat, self.nsteps,
            init_scale=init_scale, init_bias=init_bias,
            np_mask=self.np_mask, is_act_model=self.is_act_model)
        pd = self.pdfromflat(pdparam, mask_npinf, self.nsteps, self.ncat, self.is_act_model)
        return pd, pdparam, mask, mask_npinf

    def param_shape(self):
        return [self.ncat]

    def sample_shape(self):
        return []

    def sample_dtype(self):
        return tf.int32
def _matching_fc(tensor, name, size, nsteps, init_scale, init_bias, np_mask, is_act_model):
    """
    Add a fully connected projection on top of ``tensor`` together with the
    (non-trainable) mask variables, applying the multiplicative mask when
    building the train model.

    Parameters
    ----------
    tensor: input latent tensor
    name: variable scope name for the fc layer
    size: output size (number of actions)
    nsteps: number of timesteps per rollout segment (train model only)
    init_scale, init_bias: fc initializer parameters
    np_mask: pair (multiplicative mask, -inf additive mask) as numpy arrays
    is_act_model: if True, skip the per-step mask application

    Returns
    -------
    (output tensor, mask variable, -inf mask variable)
    """
    if tensor.shape[-1] == size:
        # The latent size is never expected to equal the action count here.
        # The original code used a bare ``assert False`` followed by an
        # unreachable ``return tensor`` — under ``python -O`` the assert is
        # stripped and the un-masked latent would be returned silently.
        # Raise explicitly so the guard always fires.
        raise AssertionError("latent dimension unexpectedly equals output size")
    mask = tf.get_variable("act_mask", dtype=tf.float32, initializer=np_mask[0], trainable=False)
    mask_npinf = tf.get_variable("act_mask_npinf", dtype=tf.float32, initializer=np_mask[1], trainable=False)
    res = fc(tensor, name, size, init_scale=init_scale, init_bias=init_bias)
    if not is_act_model:
        # reshape to (batch, nsteps, size) so the per-step mask lines up
        re_res = tf.reshape(res, [-1, nsteps, size])
        masked_res = tf.math.multiply(re_res, mask)
        re_masked_res = tf.reshape(masked_res, [-1, size])
        return re_masked_res, mask, mask_npinf
    return res, mask, mask_npinf
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
the main model of policy/value network
"""
import tensorflow as tf
from .util import initialize, get_session
class Model:
    """
    We use this object to :
    __init__:
        - Creates the step_model
        - Creates the train_model
    train():
        - Make the training part (feedforward and retropropagation of gradients)
    save/load():
        - Save load the model
    """
    def __init__(self, *, policy, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None, np_mask=None):
        """
        Build the act (sampling) and train policy networks plus the PPO
        clipped-surrogate loss and Adam update op.

        Parameters
        ----------
        policy: factory ``(nbatch, nsteps, sess, np_mask, is_act_model) -> policy``
        nbatch_act: batch size for the act model
        nbatch_train: batch size for the train model
        nsteps: timesteps per rollout segment
        ent_coef: entropy bonus coefficient in the total loss
        vf_coef: value-function loss coefficient in the total loss
        max_grad_norm: global-norm gradient clip threshold (None disables clipping)
        microbatch_size: if set, build the train model with this batch size instead
        np_mask: numpy masks forwarded to the policy for masking invalid actions
        """
        self.sess = sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess, np_mask=np_mask, is_act_model=True)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess, np_mask=np_mask, is_act_model=False)
            else:
                train_model = policy(microbatch_size, nsteps, sess, np_mask=np_mask, is_act_model=False)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor (negative log-probs from the rollout policy)
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic (value predictions from the rollout policy)
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        # cheap monitoring estimate of KL between old and new policy
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        # fraction of samples whose ratio was clipped
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip aggregate each gradient with parameters associated
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        initialize()

    def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
        """
        train the model.
        Here we calculate advantage A(s,a) = R + yV(s') - V(s)
        Returns = R + yV(s')

        Runs one optimizer step over the given minibatch and returns the
        evaluated stats named in ``self.loss_names``.
        """
        advs = returns - values
        # Normalize the advantages
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)
        td_map = {
            self.train_model.X : obs,
            self.A : actions,
            self.ADV : advs,
            self.R : returns,
            self.LR : lr,
            self.CLIPRANGE : cliprange,
            self.OLDNEGLOGPAC : neglogpacs,
            self.OLDVPRED : values
        }
        if states is not None:
            # recurrent policy: feed the LSTM state and done-masks as well
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks
        # run the train op together with the stats, then drop the train op's
        # (None) result so only the stats are returned
        return self.sess.run(
            self.stats_list + [self._train_op],
            td_map
        )[:-1]
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
build policy/value network from model
"""
import tensorflow as tf
from .distri import CategoricalPdType
from .util import lstm_model, fc, observation_placeholder, adjust_shape
class PolicyWithValue:
    """
    Encapsulates fields and methods for RL policy and value function estimation with shared parameters
    """

    def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None, np_mask=None, is_act_model=False, **tensors):
        """
        Parameters:
        ----------
        env: RL environment
        observations: tensorflow placeholder in which the observations will be fed
        latent: latent state from which policy distribution parameters should be inferred
        vf_latent: latent state from which value function should be inferred (if None, then latent is used)
        sess: tensorflow session to run calculations in (if None, default session is used)
        np_mask: numpy masks for invalid actions, forwarded to the pd type
        is_act_model: if True, additionally build the per-step sampling graph
        **tensors: tensorflow tensors for additional attributes such as state or mask
        """
        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        self.__dict__.update(tensors)

        vf_latent = vf_latent if vf_latent is not None else latent

        vf_latent = tf.layers.flatten(vf_latent)
        latent = tf.layers.flatten(latent)

        # Based on the action space, will select what probability distribution type
        self.np_mask = np_mask
        self.pdtype = CategoricalPdType(env.action_space.n, env.nsteps, np_mask, is_act_model)

        self.act_latent = latent
        self.nh = env.action_space.n

        self.pd, self.pi, self.mask, self.mask_npinf = self.pdtype.pdfromlatent(latent, init_scale=0.01)

        # Take an action
        self.action = self.pd.sample()

        # Calculate the neg log of our probability
        self.neglogp = self.pd.neglogp(self.action)
        self.sess = sess or tf.get_default_session()

        # only a plain value head is supported (no Q estimation)
        assert estimate_q is False
        # value head: fc to a single scalar, then squeeze the singleton dim
        self.vf = fc(vf_latent, 'vf', 1)
        self.vf = self.vf[:, 0]

        if is_act_model:
            self._build_model_for_step()

    def _evaluate(self, variables, observation, **extra_feed):
        """Run ``variables`` in the session, feeding the observation and any
        extra placeholders named in ``extra_feed`` that exist as attributes."""
        sess = self.sess
        feed_dict = {self.X: adjust_shape(self.X, observation)}
        for inpt_name, data in extra_feed.items():
            if inpt_name in self.__dict__.keys():
                inpt = self.__dict__[inpt_name]
                # only feed real placeholders; other attributes are ignored
                if isinstance(inpt, tf.Tensor) and inpt._op.type == 'Placeholder':
                    feed_dict[inpt] = adjust_shape(inpt, data)
        return sess.run(variables, feed_dict)

    def _build_model_for_step(self):
        # multiply with weight and apply mask on self.act_latent to generate
        # the logits for a single step (selected by the act_step placeholder)
        self.act_step = step = tf.placeholder(shape=(), dtype=tf.int64, name='act_step')
        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            from .util import ortho_init
            nin = self.act_latent.get_shape()[1].value
            # reuse-by-name of the 'pi' fc weights created by pdfromlatent
            w = tf.get_variable("w", [nin, self.nh], initializer=ortho_init(0.01))
            b = tf.get_variable("b", [self.nh], initializer=tf.constant_initializer(0.0))
            logits = tf.matmul(self.act_latent, w)+b

            # slice out this step's row of the multiplicative mask
            piece = tf.slice(self.mask, [step, 0], [1, self.nh])
            re_piece = tf.reshape(piece, [-1])
            masked_logits = tf.math.multiply(logits, re_piece)

            # this step's row of the additive -inf mask
            npinf_piece = tf.slice(self.mask_npinf, [step, 0], [1, self.nh])
            re_npinf_piece = tf.reshape(npinf_piece, [-1])

            def sample(logits, mask_npinf):
                # Gumbel-max sampling over the -inf-masked logits
                new_logits = tf.math.add(logits, mask_npinf)
                u = tf.random_uniform(tf.shape(new_logits), dtype=logits.dtype)
                return tf.argmax(new_logits - tf.log(-tf.log(u)), axis=-1)

            def neglogp(logits, x):
                # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
                # Note: we can't use sparse_softmax_cross_entropy_with_logits because
                # the implementation does not allow second-order derivatives...
                if x.dtype in {tf.uint8, tf.int32, tf.int64}:
                    # one-hot encoding
                    x_shape_list = x.shape.as_list()
                    logits_shape_list = logits.get_shape().as_list()[:-1]
                    for xs, ls in zip(x_shape_list, logits_shape_list):
                        if xs is not None and ls is not None:
                            assert xs == ls, 'shape mismatch: {} in x vs {} in logits'.format(xs, ls)
                    x = tf.one_hot(x, logits.get_shape().as_list()[-1])
                else:
                    # already encoded
                    assert x.shape.as_list() == logits.shape.as_list()
                return tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=logits,
                    labels=x)

            self.act_action = sample(masked_logits, re_npinf_piece)
            self.act_neglogp = neglogp(masked_logits, self.act_action)

    def step(self, step, observation, **extra_feed):
        """
        Compute next action(s) given the observation(s)
        Parameters:
        ----------
        observation: observation data (either single or a batch)
        **extra_feed: additional data such as state or mask (names of the arguments should match the ones in constructor, see __init__)
        Returns:
        -------
        (action, value estimate, next state, negative log likelihood of the action under current policy parameters) tuple
        """
        extra_feed['act_step'] = step
        a, v, state, neglogp = self._evaluate([self.act_action, self.vf, self.state, self.act_neglogp], observation, **extra_feed)
        # self.state is an empty constant for non-recurrent use; report None
        if state.size == 0:
            state = None
        return a, v, state, neglogp

    def value(self, ob, *args, **kwargs):
        """
        Compute value estimate(s) given the observation(s)
        Parameters:
        ----------
        observation: observation data (either single or a batch)
        **extra_feed: additional data such as state or mask (names of the arguments should match the ones in constructor, see __init__)
        Returns:
        -------
        value estimate
        """
        return self._evaluate(self.vf, ob, *args, **kwargs)
def build_lstm_policy(model_config, value_network=None, estimate_q=False, **policy_kwargs):
    """
    build lstm policy and value network, they share the same lstm network.
    the parameters all use their default values.

    Returns a ``policy_fn(nbatch, nsteps, sess, observ_placeholder, np_mask,
    is_act_model)`` factory producing a ``PolicyWithValue`` instance.
    """
    policy_network = lstm_model(**policy_kwargs)

    def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None, np_mask=None, is_act_model=False):
        # build one policy/value network instance
        ob_space = model_config.observation_space

        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(ob_space, batch_size=nbatch)

        extra_tensors = {}

        # encode_observation is not necessary anymore as we use embedding_lookup
        encoded_x = X

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            policy_latent = policy_network(encoded_x, 1, model_config.observation_space.n)
            if isinstance(policy_latent, tuple):
                policy_latent, recurrent_tensors = policy_latent

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(nbatch, nsteps)
                    # rebuild the network with the correct number of envs
                    policy_latent, recurrent_tensors = policy_network(encoded_x, nenv, model_config.observation_space.n)
                    extra_tensors.update(recurrent_tensors)

        _v_net = value_network
        # only a shared policy/value network is supported
        assert _v_net is None or _v_net == 'shared'
        vf_latent = policy_latent

        policy = PolicyWithValue(
            env=model_config,
            observations=X,
            latent=policy_latent,
            vf_latent=vf_latent,
            sess=sess,
            estimate_q=estimate_q,
            np_mask=np_mask,
            is_act_model=is_act_model,
            **extra_tensors
        )
        return policy

    return policy_fn
This diff is collapsed.
enum34
gym
tensorflow
\ No newline at end of file
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
util functions
"""
import os
import random
import multiprocessing
import numpy as np
import tensorflow as tf
from gym.spaces import Discrete, Box, MultiDiscrete
def set_global_seeds(i):
    """Seed TensorFlow, NumPy and the stdlib RNG (None leaves them unseeded)."""
    rank = 0
    if i is None:
        myseed = None
    else:
        # offset the seed by rank (single-process here, so rank is 0)
        myseed = i + 1000 * rank
    for seeder in (tf.set_random_seed, np.random.seed, random.seed):
        seeder(myseed)
def batch_to_seq(h, nbatch, nsteps, flat=False):
    """Split a flat batch tensor into a list of ``nsteps`` per-step tensors."""
    target_shape = [nbatch, nsteps] if flat else [nbatch, nsteps, -1]
    reshaped = tf.reshape(h, target_shape)
    steps = tf.split(axis=1, num_or_size_splits=nsteps, value=reshaped)
    # drop the singleton time axis of each split
    return [tf.squeeze(step, [1]) for step in steps]
def seq_to_batch(h, flat=False):
    """Concatenate a list of per-step tensors back into one flat batch tensor."""
    if flat:
        return tf.reshape(tf.stack(values=h, axis=1), [-1])
    shape = h[0].get_shape().as_list()
    assert len(shape) > 1
    nh = h[0].get_shape()[-1].value
    return tf.reshape(tf.concat(axis=1, values=h), [-1, nh])
def lstm(xs, ms, s, scope, nh, init_scale=1.0):
    """lstm cell

    Unrolls a standard LSTM over the per-step tensors in ``xs``.

    xs: list of per-step input tensors; NOTE: mutated in place — each entry
        is replaced with the corresponding per-step hidden state
    ms: list of per-step "done" masks used to reset the state between episodes
    s:  concatenated [c, h] state tensor of shape [nenv, 2*nh]
    scope: variable scope name for the LSTM weights
    nh: hidden state size
    init_scale: scale for the orthogonal weight initializer
    Returns the (mutated) xs list and the new concatenated [c, h] state.
    """
    nbatch, nin = [v.value for v in xs[0].get_shape()]
    with tf.variable_scope(scope):
        wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
        wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0))

    c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
    for idx, (x, m) in enumerate(zip(xs, ms)):
        # reset the recurrent state where the previous step ended an episode (m == 1)
        c = c*(1-m)
        h = h*(1-m)
        z = tf.matmul(x, wx) + tf.matmul(h, wh) + b
        # input / forget / output gates and candidate cell update
        i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
        i = tf.nn.sigmoid(i)
        f = tf.nn.sigmoid(f)
        o = tf.nn.sigmoid(o)
        u = tf.tanh(u)
        c = f*c + i*u
        h = o*tf.tanh(c)
        xs[idx] = h
    s = tf.concat(axis=1, values=[c, h])
    return xs, s
def lstm_model(nlstm=128, layer_norm=False):
    """
    Builds LSTM (Long-Short Term Memory) network to be used in a policy.
    Note that the resulting function returns not only the output of the LSTM
    (i.e. hidden state of lstm for each step in the sequence), but also a dictionary
    with auxiliary tensors to be set as policy attributes.

    Specifically,
        S is a placeholder to feed current state (LSTM state has to be managed outside policy)
        M is a placeholder for the mask (used to mask out observations after the end of the episode, but can be used for other purposes too)
        initial_state is a numpy array containing initial lstm state (usually zeros)
        state is the output LSTM state (to be fed into S at the next call)

    An example of usage of lstm-based policy can be found here: common/tests/test_doc_examples.py/test_lstm_example

    Parameters:
    ----------
    nlstm: int          LSTM hidden state size
    layer_norm: bool    if True, layer-normalized version of LSTM is used
                        (not supported here — asserted False below)

    Returns:
    -------
    function that builds LSTM with a given input tensor / placeholder
    """
    def network_fn(X, nenv=1, obs_size=-1):
        # embed integer observations into 32-dim vectors before the LSTM
        with tf.variable_scope("emb", reuse=tf.AUTO_REUSE):
            w_emb = tf.get_variable("w_emb", [obs_size+1, 32])
            X = tf.nn.embedding_lookup(w_emb, X)

        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        h = tf.layers.flatten(X)

        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states

        # split the flat batch into per-step tensors for the unrolled LSTM
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        # layer-normalized LSTM is not implemented in this port
        assert not layer_norm
        h5, snew = lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state}
    return network_fn
def ortho_init(scale=1.0):
    """Return a TF-style initializer producing (scaled) orthogonal weights."""
    def _ortho_init(shape, dtype, partition_info=None):
        # lasagne-style orthogonal init via SVD of a Gaussian matrix
        dims = tuple(shape)
        ndim = len(dims)
        if ndim == 2:
            flat_shape = dims
        elif ndim == 4:
            # conv kernel, assumes NHWC layout: fold all but the last dim
            flat_shape = (np.prod(dims[:-1]), dims[-1])
        else:
            raise NotImplementedError
        gaussian = np.random.normal(0.0, 1.0, flat_shape)
        u, _, v = np.linalg.svd(gaussian, full_matrices=False)
        # pick whichever factor has the requested flat shape
        basis = u if u.shape == flat_shape else v
        weights = basis.reshape(dims)
        return (scale * weights[:dims[0], :dims[1]]).astype(np.float32)
    return _ortho_init
def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0):
    """Fully connected layer: ``x @ w + b`` built inside the given variable scope."""
    with tf.variable_scope(scope):
        nin = x.get_shape()[1].value
        weight = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
        bias = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias))
        linear = tf.matmul(x, weight)
        return linear + bias
def _check_shape(placeholder_shape, data_shape):
    """
    check if two shapes are compatible (i.e. differ only by dimensions of size 1, or by the batch dimension)

    NOTE(review): the check is currently a stub that always returns True —
    both arguments are ignored. np.reshape in adjust_shape will still raise
    on truly incompatible sizes, so this appears intentional; confirm before
    implementing a real check.
    """
    return True
# ================================================================
# Shape adjustment for feeding into tf placeholders
# ================================================================
def adjust_shape(placeholder, data):
    """
    adjust shape of the data to the shape of the placeholder if possible.
    If shape is incompatible, AssertionError is thrown

    Parameters:
        placeholder: tensorflow input placeholder
        data: input data to be (potentially) reshaped to be fed into placeholder

    Returns:
        reshaped data (non-array, non-list inputs are returned unchanged)
    """
    # pass through anything that is not array-like
    if not isinstance(data, (np.ndarray, list)):
        return data
    if isinstance(data, list):
        data = np.array(data)

    # unknown (None) placeholder dims become -1 so np.reshape can infer them
    placeholder_shape = [dim or -1 for dim in placeholder.shape.as_list()]

    assert _check_shape(placeholder_shape, data.shape), \
        'Shape of data {} is not compatible with shape of the placeholder {}'.format(data.shape, placeholder_shape)

    return np.reshape(data, placeholder_shape)
# ================================================================
# Global session
# ================================================================
def get_session(config=None):
    """Get default session or create one with a given config"""
    existing = tf.get_default_session()
    if existing is not None:
        return existing
    # no default session yet: create one and install it as the default
    return make_session(config=config, make_default=True)
def make_session(config=None, num_cpu=None, make_default=False, graph=None):
    """Returns a session that will use <num_cpu> CPU's only"""
    if num_cpu is None:
        num_cpu = int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count()))
    if config is None:
        config = tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=num_cpu,
            intra_op_parallelism_threads=num_cpu)
        # grow GPU memory on demand instead of grabbing it all up front
        config.gpu_options.allow_growth = True
    # InteractiveSession installs itself as the default session
    session_cls = tf.InteractiveSession if make_default else tf.Session
    return session_cls(config=config, graph=graph)
# variables that have already been run through variables_initializer
ALREADY_INITIALIZED = set()

def initialize():
    """Initialize all the uninitialized variables in the global scope."""
    fresh = set(tf.global_variables()).difference(ALREADY_INITIALIZED)
    get_session().run(tf.variables_initializer(fresh))
    ALREADY_INITIALIZED.update(fresh)
def observation_placeholder(ob_space, batch_size=None, name='Ob'):
    """
    Create placeholder to feed observations into of the size appropriate to the observation space

    Parameters:
    ----------
    ob_space: gym.Space     observation space
    batch_size: int         size of the batch to be fed into input. Can be left None in most cases.
    name: str               name of the placeholder

    Returns:
    -------
    tensorflow placeholder tensor
    """
    assert isinstance(ob_space, (Discrete, Box, MultiDiscrete)), \
        'Can only deal with Discrete and Box observation spaces for now'

    dtype = ob_space.dtype
    if dtype == np.int8:
        # promote int8 to uint8 for byte-valued observations
        dtype = np.uint8

    full_shape = (batch_size,) + ob_space.shape
    return tf.placeholder(shape=full_shape, dtype=dtype, name=name)
def explained_variance(ypred, y):
    """
    Fraction of the variance of ``y`` explained by ``ypred``:
    1 - Var[y - ypred] / Var[y].

    interpretation:
        ev=0  =>  might as well have predicted zero
        ev=1  =>  perfect prediction
        ev<0  =>  worse than just predicting zero
    Returns NaN when Var[y] is zero.
    """
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    if vary == 0:
        return np.nan
    return 1 - np.var(y - ypred) / vary
......@@ -17,11 +17,10 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
import logging
import nni
from .recoverable import Recoverable
_logger = logging.getLogger(__name__)
......
......@@ -39,17 +39,18 @@ class AnnotationTestCase(TestCase):
shutil.rmtree('_generated')
def test_search_space_generator(self):
search_space = generate_search_space('testcase/annotated')
shutil.copytree('testcase/annotated', '_generated/annotated')
search_space = generate_search_space('_generated/annotated')
with open('testcase/searchspace.json') as f:
self.assertEqual(search_space, json.load(f))
def test_code_generator(self):
code_dir = expand_annotations('testcase/usercode', '_generated', nas_mode='classic_mode')
self.assertEqual(code_dir, '_generated')
self._assert_source_equal('testcase/annotated/nas.py', '_generated/nas.py')
self._assert_source_equal('testcase/annotated/mnist.py', '_generated/mnist.py')
self._assert_source_equal('testcase/annotated/dir/simple.py', '_generated/dir/simple.py')
with open('testcase/usercode/nonpy.txt') as src, open('_generated/nonpy.txt') as dst:
code_dir = expand_annotations('testcase/usercode', '_generated/usercode', nas_mode='classic_mode')
self.assertEqual(code_dir, '_generated/usercode')
self._assert_source_equal('testcase/annotated/nas.py', '_generated/usercode/nas.py')
self._assert_source_equal('testcase/annotated/mnist.py', '_generated/usercode/mnist.py')
self._assert_source_equal('testcase/annotated/dir/simple.py', '_generated/usercode/dir/simple.py')
with open('testcase/usercode/nonpy.txt') as src, open('_generated/usercode/nonpy.txt') as dst:
assert src.read() == dst.read()
def test_annotation_detecting(self):
......
......@@ -142,6 +142,24 @@ tuner_schema_dict = {
Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool),
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
},
'PPOTuner': {
'builtinTunerName': 'PPOTuner',
'classArgs': {
'optimize_mode': setChoice('optimize_mode', 'maximize', 'minimize'),
Optional('trials_per_update'): setNumberRange('trials_per_update', int, 0, 99999),
Optional('epochs_per_update'): setNumberRange('epochs_per_update', int, 0, 99999),
Optional('minibatch_size'): setNumberRange('minibatch_size', int, 0, 99999),
Optional('ent_coef'): setType('ent_coef', float),
Optional('lr'): setType('lr', float),
Optional('vf_coef'): setType('vf_coef', float),
Optional('max_grad_norm'): setType('max_grad_norm', float),
Optional('gamma'): setType('gamma', float),
Optional('lam'): setType('lam', float),
Optional('cliprange'): setType('cliprange', float),
},
Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool),
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
},
'customized': {
'codeDir': setPathCheck('codeDir'),
'classFileName': setType('classFileName', str),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment