Commit 252f36f8, authored Aug 20, 2018 by Deshui Yu

NNI dogfood version 1

Parent: 781cea26
Changes: 214
Showing 20 changed files with 2697 additions and 0 deletions (+2697, -0)
examples/trials/ga_squad/evaluate.py            +158  -0
examples/trials/ga_squad/graph.py               +287  -0
examples/trials/ga_squad/graph_to_tf.py         +338  -0
examples/trials/ga_squad/readme.md              +10   -0
examples/trials/ga_squad/requirements.txt       +1    -0
examples/trials/ga_squad/rnn.py                 +118  -0
examples/trials/ga_squad/train_model.py         +259  -0
examples/trials/ga_squad/trial.py               +455  -0
examples/trials/ga_squad/util.py                +76   -0
examples/trials/mnist-annotation/config.yml     +18   -0
examples/trials/mnist-annotation/mnist.py       +236  -0
examples/trials/mnist-keras/config.yml          +19   -0
examples/trials/mnist-keras/mnist-keras.py      +131  -0
examples/trials/mnist-keras/search_space.json   +4    -0
examples/trials/mnist-smartparam/config.yml     +18   -0
examples/trials/mnist-smartparam/mnist.py       +229  -0
examples/trials/mnist/config.yml                +19   -0
examples/trials/mnist/mnist.py                  +230  -0
examples/trials/mnist/search_space.json         +6    -0
examples/tuners/README.md                       +85   -0
examples/trials/ga_squad/evaluate.py  0 → 100644
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys


def normalize_answer(str_input):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        '''
        Remove "a|an|the"
        '''
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        '''
        Remove unnecessary whitespace
        '''
        return ' '.join(text.split())

    def remove_punc(text):
        '''
        Remove punctuation
        '''
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        '''
        Change string to lower form.
        '''
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(str_input))))


def f1_score(prediction, ground_truth):
    '''
    Calculate the f1 score.
    '''
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    '''
    Calculate the match score with prediction and ground truth.
    '''
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    '''
    Metric max over the ground truths.
    '''
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def _evaluate(dataset, predictions):
    '''
    Evaluate function.
    '''
    f1 = exact_match = total = 0
    count = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                if qa['id'] not in predictions:
                    message = 'Unanswered question ' + qa['id'] + \
                              ' will receive score 0.'
                    #print(message, file=sys.stderr)
                    count += 1
                    continue
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)
    print('total', total, 'exact_match', exact_match, 'unanswer_question ', count)
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    return {'exact_match': exact_match, 'f1': f1}


def evaluate(data_file, pred_file):
    '''
    Evaluate.
    '''
    expected_version = '1.1'
    with open(data_file) as dataset_file:
        dataset_json = json.load(dataset_file)
        if dataset_json['version'] != expected_version:
            print('Evaluation expects v-' + expected_version +
                  ', but got dataset with v-' + dataset_json['version'],
                  file=sys.stderr)
        dataset = dataset_json['data']
    with open(pred_file) as prediction_file:
        predictions = json.load(prediction_file)
    # print(json.dumps(evaluate(dataset, predictions)))
    result = _evaluate(dataset, predictions)
    # print('em:', result['exact_match'], 'f1:', result['f1'])
    return result['exact_match']


def evaluate_with_predictions(data_file, predictions):
    '''
    Evaluate with predictions.
    '''
    expected_version = '1.1'
    with open(data_file) as dataset_file:
        dataset_json = json.load(dataset_file)
        if dataset_json['version'] != expected_version:
            print('Evaluation expects v-' + expected_version +
                  ', but got dataset with v-' + dataset_json['version'],
                  file=sys.stderr)
        dataset = dataset_json['data']
    result = _evaluate(dataset, predictions)
    return result['exact_match']


if __name__ == '__main__':
    EXPECT_VERSION = '1.1'
    parser = argparse.ArgumentParser(
        description='Evaluation for SQuAD ' + EXPECT_VERSION)
    parser.add_argument('dataset_file', help='Dataset file')
    parser.add_argument('prediction_file', help='Prediction File')
    args = parser.parse_args()
    print(evaluate(args.dataset_file, args.prediction_file))
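
Besides the command-line entry point above, the module can also be called programmatically; a minimal usage sketch (the file paths and the answers dict below are placeholders, not part of this commit):

# Sketch: score predictions against a SQuAD v1.1 dev set with this module.
import evaluate

# From a prediction file on disk (returns the exact-match percentage):
em = evaluate.evaluate('dev-v1.1.json', 'predictions.json')

# Or from an in-memory {question_id: answer_text} dict, as trial.py does:
em = evaluate.evaluate_with_predictions('dev-v1.1.json', {'question-id': 'answer text'})
print(em)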
examples/trials/ga_squad/graph.py  0 → 100644
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
'''
Graph is a custom-defined class; this module contains the related classes and functions about graphs.
'''

import copy
import json
import random
from enum import Enum, unique


@unique
class LayerType(Enum):
    '''
    Layer type
    '''
    attention = 0
    self_attention = 1
    rnn = 2
    input = 3
    output = 4


class Layer(object):
    '''
    Layer class, which contains the information of the graph.
    '''
    def __init__(self, graph_type, input=None, output=None, size=None):
        self.input = input if input is not None else []
        self.output = output if output is not None else []
        self.graph_type = graph_type
        self.is_delete = False
        self.size = size
        if graph_type == LayerType.attention.value:
            self.input_size = 2
            self.output_size = 1
        elif graph_type == LayerType.rnn.value:
            self.input_size = 1
            self.output_size = 1
        elif graph_type == LayerType.self_attention.value:
            self.input_size = 1
            self.output_size = 1
        elif graph_type == LayerType.input.value:
            self.input_size = 0
            self.output_size = 1
        elif graph_type == LayerType.output.value:
            self.input_size = 1
            self.output_size = 0
        else:
            print(graph_type)

    def set_size(self, graph_id, size):
        '''
        Set size.
        '''
        if self.graph_type == LayerType.attention.value:
            if self.input[0] == graph_id:
                self.size = size
        if self.graph_type == LayerType.rnn.value:
            self.size = size
        if self.graph_type == LayerType.self_attention.value:
            self.size = size
        if self.graph_type == LayerType.output.value:
            if self.size != size:
                return False
        return True

    def clear_size(self):
        '''
        Clear size
        '''
        if self.graph_type == LayerType.attention.value or \
                LayerType.rnn.value or LayerType.self_attention.value:
            self.size = None

    def __str__(self):
        return 'input:' + str(self.input) + ' output:' + str(self.output) + \
            ' type:' + str(self.graph_type) + ' is_delete:' + str(self.is_delete) + \
            ' size:' + str(self.size)


def graph_dumps(graph):
    '''
    Dump the graph.
    '''
    return json.dumps(graph, default=lambda obj: obj.__dict__)


def graph_loads(graph_json):
    '''
    Load graph
    '''
    layers = []
    for layer in graph_json['layers']:
        layer_info = Layer(layer['type'], layer['input'], layer['output'], layer['size'])
        layer_info.is_delete = layer['is_delete']
        layers.append(layer_info)
    graph = Graph(graph_json['max_layer_num'], [], [], [])
    graph.layers = layers
    return graph


class Graph(object):
    '''
    Custom Graph class.
    '''
    def __init__(self, max_layer_num, input, output, hide):
        self.layers = []
        self.max_layer_num = max_layer_num

        for layer in input:
            self.layers.append(layer)
        for layer in output:
            self.layers.append(layer)
        if hide is not None:
            for layer in hide:
                self.layers.append(layer)
        assert self.is_legal()

    def is_topology(self, layers=None):
        '''
        Validate the topology.
        '''
        if layers is None:
            layers = self.layers
        layers_nodle = []
        result = []
        for i, layer in enumerate(layers):
            if layer.is_delete is False:
                layers_nodle.append(i)
        while True:
            flag_break = True
            layers_toremove = []
            for layer1 in layers_nodle:
                flag_arrive = True
                for layer2 in layers[layer1].input:
                    if layer2 in layers_nodle:
                        flag_arrive = False
                if flag_arrive is True:
                    for layer2 in layers[layer1].output:
                        # Size is error
                        if layers[layer2].set_size(layer1, layers[layer1].size) is False:
                            return False
                    layers_toremove.append(layer1)
                    result.append(layer1)
                    flag_break = False
            for layer in layers_toremove:
                layers_nodle.remove(layer)
            result.append('|')
            if flag_break:
                break
        # There is a loop in the graph, or some layers cannot be reached
        if layers_nodle:
            return False
        return result

    def layer_num(self, layers=None):
        '''
        Return number of layers.
        '''
        if layers is None:
            layers = self.layers
        layer_num = 0
        for layer in layers:
            if layer.is_delete is False and layer.graph_type != LayerType.input.value \
                    and layer.graph_type != LayerType.output.value:
                layer_num += 1
        return layer_num

    def is_legal(self, layers=None):
        '''
        Judge whether the layers are legal.
        '''
        if layers is None:
            layers = self.layers

        for layer in layers:
            if layer.is_delete is False:
                if len(layer.input) != layer.input_size:
                    return False
                if len(layer.output) < layer.output_size:
                    return False

        # layer_num <= max_layer_num
        if self.layer_num(layers) > self.max_layer_num:
            return False

        # There is a loop in the graph, or some layers cannot be reached
        if self.is_topology(layers) is False:
            return False

        return True

    def mutation(self, only_add=False):
        '''
        Mutation for a graph
        '''
        types = []
        if self.layer_num() < self.max_layer_num:
            types.append(0)
            types.append(1)
        if self.layer_num() > 5 and only_add is False:
            types.append(2)
            types.append(3)
        # 0 : add a layer, delete an edge
        # 1 : add a layer, change an edge
        # 2 : delete a layer, delete an edge
        # 3 : delete a layer, change an edge
        graph_type = random.choice(types)
        layer_type = random.choice([LayerType.attention.value, \
            LayerType.self_attention.value, LayerType.rnn.value])
        layers = copy.deepcopy(self.layers)
        cnt_try = 0
        while True:
            layers_in = []
            layers_out = []
            layers_del = []
            for i, layer in enumerate(layers):
                if layer.is_delete is False:
                    if layer.graph_type != LayerType.output.value:
                        layers_in.append(i)
                    if layer.graph_type != LayerType.input.value:
                        layers_out.append(i)
                    if layer.graph_type != LayerType.output.value \
                            and layer.graph_type != LayerType.input.value:
                        layers_del.append(i)
            if graph_type <= 1:
                new_id = len(layers)
                out = random.choice(layers_out)
                input = []
                output = [out]
                pos = random.randint(0, len(layers[out].input) - 1)
                last_in = layers[out].input[pos]
                layers[out].input[pos] = new_id
                if graph_type == 0:
                    layers[last_in].output.remove(out)
                if graph_type == 1:
                    layers[last_in].output.remove(out)
                    layers[last_in].output.append(new_id)
                    input = [last_in]
                lay = Layer(graph_type=layer_type, input=input, output=output)
                while len(input) < lay.input_size:
                    layer1 = random.choice(layers_in)
                    input.append(layer1)
                    layers[layer1].output.append(new_id)
                lay.input = input
                layers.append(lay)
            else:
                layer1 = random.choice(layers_del)
                for layer2 in layers[layer1].output:
                    layers[layer2].input.remove(layer1)
                    if graph_type == 2:
                        random_in = random.choice(layers_in)
                    else:
                        random_in = random.choice(layers[layer1].input)
                    layers[layer2].input.append(random_in)
                    layers[random_in].output.append(layer2)
                for layer2 in layers[layer1].input:
                    layers[layer2].output.remove(layer1)
                layers[layer1].is_delete = True

            if self.is_legal(layers):
                self.layers = layers
                break
            else:
                layers = copy.deepcopy(self.layers)
                cnt_try += 1

    def __str__(self):
        info = ""
        for l_id, layer in enumerate(self.layers):
            if layer.is_delete is False:
                info += 'id:%d ' % l_id + str(layer) + '\n'
        return info
examples/trials/ga_squad/graph_to_tf.py  0 → 100644
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import tensorflow as tf

from rnn import XGRUCell
from util import dropout
from graph import LayerType


def normalize(inputs,
              epsilon=1e-8,
              scope="ln"):
    '''Applies layer normalization.
    Args:
        inputs: A tensor with 2 or more dimensions, where the first dimension has
            `batch_size`.
        epsilon: A floating number. A very small number for preventing ZeroDivision Error.
        scope: Optional scope for `variable_scope`.
        reuse: Boolean, whether to reuse the weights of a previous layer
            by the same name.
    Returns:
        A tensor with the same shape and data dtype as `inputs`.
    '''
    with tf.variable_scope(scope):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.Variable(tf.zeros(params_shape))
        gamma = tf.Variable(tf.ones(params_shape))
        normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
        outputs = gamma * normalized + beta

    return outputs


def multihead_attention(queries,
                        keys,
                        scope="multihead_attention",
                        num_units=None,
                        num_heads=4,
                        dropout_rate=0,
                        is_training=True,
                        causality=False):
    '''Applies multihead attention.
    Args:
        queries: A 3d tensor with shape of [N, T_q, C_q].
        keys: A 3d tensor with shape of [N, T_k, C_k].
        num_units: A scalar. Attention size.
        dropout_rate: A floating point number.
        is_training: Boolean. Controller of mechanism for dropout.
        causality: Boolean. If true, units that reference the future are masked.
        num_heads: An int. Number of heads.
        scope: Optional scope for `variable_scope`.
        reuse: Boolean, whether to reuse the weights of a previous layer
            by the same name.
    Returns
        A 3d tensor with shape of (N, T_q, C)
    '''
    global look5
    with tf.variable_scope(scope):
        # Set the fall back option for num_units
        if num_units is None:
            num_units = queries.get_shape().as_list()[-1]

        Q_ = []
        K_ = []
        V_ = []
        for _ in range(num_heads):
            Q = tf.layers.dense(queries, num_units / num_heads,
                                activation=tf.nn.relu)  # (N, T_q, C)
            K = tf.layers.dense(keys, num_units / num_heads,
                                activation=tf.nn.relu)  # (N, T_k, C)
            V = tf.layers.dense(keys, num_units / num_heads,
                                activation=tf.nn.relu)  # (N, T_k, C)
            Q_.append(Q)
            K_.append(K)
            V_.append(V)

        # Split and concat
        Q_ = tf.concat(Q_, axis=0)  # (h*N, T_q, C/h)
        K_ = tf.concat(K_, axis=0)  # (h*N, T_k, C/h)
        V_ = tf.concat(V_, axis=0)  # (h*N, T_k, C/h)

        # Multiplication
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)

        # Scale
        outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)

        # Key Masking
        key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))  # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1),
                            [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)

        paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Causality = Future blinding
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :])  # (T_q, T_k)
            tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense()  # (T_q, T_k)
            masks = tf.tile(tf.expand_dims(tril, 0),
                            [tf.shape(outputs)[0], 1, 1])  # (h*N, T_q, T_k)

            paddings = tf.ones_like(masks) * (-2 ** 32 + 1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Activation
        look5 = outputs
        outputs = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)

        # Query Masking
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))  # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1),
                              [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
        outputs *= query_masks  # broadcasting. (N, T_q, C)

        # Dropouts
        outputs = dropout(outputs, dropout_rate, is_training)

        # Weighted sum
        outputs = tf.matmul(outputs, V_)  # ( h*N, T_q, C/h)

        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)

        # Residual connection
        if queries.get_shape().as_list()[-1] == num_units:
            outputs += queries

        # Normalize
        outputs = normalize(outputs, scope=scope)  # (N, T_q, C)

    return outputs


def positional_encoding(inputs,
                        num_units=None,
                        zero_pad=True,
                        scale=True,
                        scope="positional_encoding",
                        reuse=None):
    '''
    Return positional embedding.
    '''
    Shape = tf.shape(inputs)
    N = Shape[0]
    T = Shape[1]
    num_units = Shape[2]
    with tf.variable_scope(scope, reuse=reuse):
        position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])

        # First part of the PE function: sin and cos argument
        # Second part, apply the cosine to even columns and sin to odds.
        X = tf.expand_dims(tf.cast(tf.range(T), tf.float32), axis=1)
        Y = tf.expand_dims(
            tf.cast(10000 ** -(2 * tf.range(num_units) / num_units), tf.float32), axis=0)
        h1 = tf.cast((tf.range(num_units) + 1) % 2, tf.float32)
        h2 = tf.cast((tf.range(num_units) % 2), tf.float32)
        position_enc = tf.multiply(X, Y)
        position_enc = tf.sin(position_enc) * tf.multiply(tf.ones_like(X), h1) + \
            tf.cos(position_enc) * tf.multiply(tf.ones_like(X), h2)

        # Convert to a tensor
        lookup_table = position_enc

        if zero_pad:
            lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
                                      lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, position_ind)

        if scale:
            outputs = outputs * tf.sqrt(tf.cast(num_units, tf.float32))

    return outputs


def feedforward(inputs,
                num_units,
                scope="multihead_attention"):
    '''Point-wise feed forward net.
    Args:
        inputs: A 3d tensor with shape of [N, T, C].
        num_units: A list of two integers.
        scope: Optional scope for `variable_scope`.
        reuse: Boolean, whether to reuse the weights of a previous layer
            by the same name.
    Returns:
        A 3d tensor with the same shape and dtype as inputs
    '''
    with tf.variable_scope(scope):
        # Inner layer
        params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1,
                  "activation": tf.nn.relu, "use_bias": True}
        outputs = tf.layers.conv1d(**params)

        # Readout layer
        params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1,
                  "activation": None, "use_bias": True}
        outputs = tf.layers.conv1d(**params)

        # Residual connection
        outputs += inputs

        # Normalize
        outputs = normalize(outputs)

    return outputs


def rnn(input_states, sequence_lengths, dropout_rate, is_training, num_units):
    layer_cnt = 1
    states = []
    xs = tf.transpose(input_states, perm=[1, 0, 2])
    for i in range(0, layer_cnt):
        xs = dropout(xs, dropout_rate, is_training)
        with tf.variable_scope('layer_' + str(i)):
            cell_fw = XGRUCell(num_units)
            cell_bw = XGRUCell(num_units)
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                dtype=tf.float32,
                sequence_length=sequence_lengths,
                inputs=xs,
                time_major=True)

        y_lr, y_rl = outputs
        xs = tf.concat([y_lr, y_rl], 2)
        states.append(xs)

    return tf.transpose(dropout(tf.concat(states, axis=2),
                                dropout_rate,
                                is_training), perm=[1, 0, 2])


def graph_to_network(input1,
                     input2,
                     input1_lengths,
                     input2_lengths,
                     graph,
                     dropout_rate,
                     is_training,
                     num_heads=1,
                     rnn_units=256):
    topology = graph.is_topology()
    layers = dict()
    layers_sequence_lengths = dict()
    num_units = input1.get_shape().as_list()[-1]
    layers[0] = input1 * tf.sqrt(tf.cast(num_units, tf.float32)) + \
        positional_encoding(input1, scale=False, zero_pad=False)
    layers[1] = input2 * tf.sqrt(tf.cast(num_units, tf.float32))
    layers[0] = dropout(layers[0], dropout_rate, is_training)
    layers[1] = dropout(layers[1], dropout_rate, is_training)
    layers_sequence_lengths[0] = input1_lengths
    layers_sequence_lengths[1] = input2_lengths
    for _, topo_i in enumerate(topology):
        if topo_i == '|':
            continue
        if graph.layers[topo_i].graph_type == LayerType.input.value:
            continue
        elif graph.layers[topo_i].graph_type == LayerType.attention.value:
            with tf.variable_scope('attation_%d' % topo_i):
                layer = multihead_attention(layers[graph.layers[topo_i].input[0]],
                                            layers[graph.layers[topo_i].input[1]],
                                            scope="multihead_attention%d" % topo_i,
                                            dropout_rate=dropout_rate,
                                            is_training=is_training,
                                            num_heads=num_heads,
                                            num_units=rnn_units * 2)
                layer = feedforward(layer, scope="feedforward%d" % topo_i,
                                    num_units=[rnn_units * 2 * 4, rnn_units * 2])
            layers[topo_i] = layer
            layers_sequence_lengths[topo_i] = layers_sequence_lengths[
                graph.layers[topo_i].input[0]]
        elif graph.layers[topo_i].graph_type == LayerType.self_attention.value:
            with tf.variable_scope('self-attation_%d' % topo_i):
                layer = multihead_attention(layers[graph.layers[topo_i].input[0]],
                                            layers[graph.layers[topo_i].input[0]],
                                            scope="multihead_attention%d" % topo_i,
                                            dropout_rate=dropout_rate,
                                            is_training=is_training,
                                            num_heads=num_heads,
                                            num_units=rnn_units * 2)
                layer = feedforward(layer, scope="feedforward%d" % topo_i,
                                    num_units=[rnn_units * 2 * 4, rnn_units * 2])
            layers[topo_i] = layer
            layers_sequence_lengths[topo_i] = layers_sequence_lengths[
                graph.layers[topo_i].input[0]]
        elif graph.layers[topo_i].graph_type == LayerType.rnn.value:
            with tf.variable_scope('rnn_%d' % topo_i):
                layer = rnn(layers[graph.layers[topo_i].input[0]],
                            layers_sequence_lengths[graph.layers[topo_i].input[0]],
                            dropout_rate,
                            is_training,
                            rnn_units)
            layers[topo_i] = layer
            layers_sequence_lengths[topo_i] = layers_sequence_lengths[
                graph.layers[topo_i].input[0]]
        elif graph.layers[topo_i].graph_type == LayerType.output.value:
            layers[topo_i] = layers[graph.layers[topo_i].input[0]]
            if layers[topo_i].get_shape().as_list()[-1] != rnn_units * 1 * 2:
                with tf.variable_scope('add_dense'):
                    layers[topo_i] = tf.layers.dense(layers[topo_i], units=rnn_units * 2)
    return layers[2], layers[3]
examples/trials/ga_squad/readme.md  0 → 100644
## How to download data
1. Download "dev-v1.1.json" and "train-v1.1.json" from https://rajpurkar.github.io/SQuAD-explorer/
2. Download "glove.840B.300d.txt" from https://nlp.stanford.edu/projects/glove/
## How to submit this job
1. Run "$NNI_ROOT_DIR/auto_run.py" as described in "$NNI_ROOT_DIR/README-AUTO.md".
2. Use the docker image openpai.azurecr.io/nni_v0.0.1, which means it uses a TensorFlow CPU version.
3. This model doesn't need a search_space.json.
\ No newline at end of file
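
For context, the trial added in this commit receives its candidate architecture from NNI and reports accuracy back; a condensed, non-authoritative sketch of that flow (see trial.py below for the full version; the helper name graph_module is renamed here only to avoid shadowing):

# Condensed sketch of the NNI trial flow implemented in trial.py:
import nni
import graph as graph_module

params = nni.get_parameters()                 # JSON graph produced by the tuner
net_graph = graph_module.graph_loads(params)  # rebuild the Graph object
# ... build and train the model for this graph, then per epoch:
#         nni.report_intermediate_result(acc)
# ... and finally:
#         nni.report_final_result(best_acc)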
examples/trials/ga_squad/requirements.txt  0 → 100644
tensorflow==1.4.0
\ No newline at end of file
examples/trials/ga_squad/rnn.py  0 → 100644
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import tensorflow as tf
from tensorflow.python.ops.rnn_cell_impl import RNNCell


class GRU:
    '''
    GRU class.
    '''
    def __init__(self, name, input_dim, hidden_dim):
        self.name = '/'.join([name, 'gru'])
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.w_matrix = None
        self.U = None
        self.bias = None

    def define_params(self):
        '''
        Define parameters.
        '''
        input_dim = self.input_dim
        hidden_dim = self.hidden_dim
        prefix = self.name
        self.w_matrix = tf.Variable(tf.random_normal([input_dim, 3 * hidden_dim], stddev=0.1),
                                    name='/'.join([prefix, 'W']))
        self.U = tf.Variable(tf.random_normal([hidden_dim, 3 * hidden_dim], stddev=0.1),
                             name='/'.join([prefix, 'U']))
        self.bias = tf.Variable(tf.random_normal([1, 3 * hidden_dim], stddev=0.1),
                                name='/'.join([prefix, 'b']))
        return self

    def build(self, x, h, mask=None):
        '''
        Build the GRU cell.
        '''
        xw = tf.split(tf.matmul(x, self.w_matrix) + self.bias, 3, 1)
        hu = tf.split(tf.matmul(h, self.U), 3, 1)
        r = tf.sigmoid(xw[0] + hu[0])
        z = tf.sigmoid(xw[1] + hu[1])
        h1 = tf.tanh(xw[2] + r * hu[2])
        next_h = h1 * (1 - z) + h * z
        if mask is not None:
            next_h = next_h * mask + h * (1 - mask)
        return next_h

    def build_sequence(self, xs, masks, init, is_left_to_right):
        '''
        Build GRU sequence.
        '''
        states = []
        last = init
        if is_left_to_right:
            for i, xs_i in enumerate(xs):
                h = self.build(xs_i, last, masks[i])
                states.append(h)
                last = h
        else:
            for i in range(len(xs) - 1, -1, -1):
                h = self.build(xs[i], last, masks[i])
                states.insert(0, h)
                last = h
        return states


class XGRUCell(RNNCell):

    def __init__(self, hidden_dim, reuse=None):
        super(XGRUCell, self).__init__(self, _reuse=reuse)
        self._num_units = hidden_dim
        self._activation = tf.tanh

    @property
    def state_size(self):
        return self._num_units

    @property
    def output_size(self):
        return self._num_units

    def call(self, inputs, state):
        input_dim = inputs.get_shape()[-1]
        assert input_dim is not None, "input dimension must be defined"
        W = tf.get_variable(
            name="W", shape=[input_dim, 3 * self._num_units], dtype=tf.float32)
        U = tf.get_variable(
            name='U', shape=[self._num_units, 3 * self._num_units], dtype=tf.float32)
        b = tf.get_variable(
            name='b', shape=[1, 3 * self._num_units], dtype=tf.float32)

        xw = tf.split(tf.matmul(inputs, W) + b, 3, 1)
        hu = tf.split(tf.matmul(state, U), 3, 1)
        r = tf.sigmoid(xw[0] + hu[0])
        z = tf.sigmoid(xw[1] + hu[1])
        h1 = self._activation(xw[2] + r * hu[2])
        next_h = h1 * (1 - z) + state * z
        return next_h, next_h
examples/trials/ga_squad/train_model.py  0 → 100644
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
'''
Train the network combined by RNN and attention.
'''

import tensorflow as tf

from attention import DotAttention
from rnn import XGRUCell
from util import dropout
from graph_to_tf import graph_to_network


class GAGConfig:
    def __init__(self):
        self.batch_size = 128

        self.dropout = 0.1

        self.char_vcb_size = 1371
        self.max_char_length = 20
        self.char_embed_dim = 100

        self.max_query_length = 40
        self.max_passage_length = 800

        self.att_is_vanilla = True
        self.att_need_padding = False
        self.att_is_id = False

        self.ptr_dim = 70
        self.learning_rate = 0.1
        self.labelsmoothing = 0.1
        self.num_heads = 1
        self.rnn_units = 256


class GAG:
    def __init__(self, cfg, embed, graph):
        self.cfg = cfg
        self.embed = embed
        self.graph = graph

        self.query_word = None
        self.query_mask = None
        self.query_lengths = None
        self.passage_word = None
        self.passage_mask = None
        self.passage_lengths = None
        self.answer_begin = None
        self.answer_end = None
        self.query_char_ids = None
        self.query_char_lengths = None
        self.passage_char_ids = None
        self.passage_char_lengths = None
        self.passage_states = None
        self.query_states = None
        self.query_init = None
        self.begin_prob = None
        self.end_prob = None
        self.loss = None
        self.train_op = None

    def build_net(self, is_training):
        cfg = self.cfg
        with tf.device('/cpu:0'):
            word_embed = tf.get_variable(
                name='word_embed', initializer=self.embed, dtype=tf.float32, trainable=False)
            char_embed = tf.get_variable(name='char_embed',
                                         shape=[cfg.char_vcb_size, cfg.char_embed_dim],
                                         dtype=tf.float32)

        # [query_length, batch_size]
        self.query_word = tf.placeholder(dtype=tf.int32,
                                         shape=[None, None],
                                         name='query_word')
        self.query_mask = tf.placeholder(dtype=tf.float32,
                                         shape=[None, None],
                                         name='query_mask')
        # [batch_size]
        self.query_lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='query_lengths')

        # [passage_length, batch_size]
        self.passage_word = tf.placeholder(dtype=tf.int32,
                                           shape=[None, None],
                                           name='passage_word')
        self.passage_mask = tf.placeholder(dtype=tf.float32,
                                           shape=[None, None],
                                           name='passage_mask')
        # [batch_size]
        self.passage_lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='passage_lengths')

        if is_training:
            self.answer_begin = tf.placeholder(dtype=tf.int32, shape=[None], name='answer_begin')
            self.answer_end = tf.placeholder(dtype=tf.int32, shape=[None], name='answer_end')

        self.query_char_ids = tf.placeholder(dtype=tf.int32,
                                             shape=[self.cfg.max_char_length, None, None],
                                             name='query_char_ids')
        # sequence_length, batch_size
        self.query_char_lengths = tf.placeholder(dtype=tf.int32,
                                                 shape=[None, None],
                                                 name='query_char_lengths')

        self.passage_char_ids = tf.placeholder(dtype=tf.int32,
                                               shape=[self.cfg.max_char_length, None, None],
                                               name='passage_char_ids')
        # sequence_length, batch_size
        self.passage_char_lengths = tf.placeholder(dtype=tf.int32,
                                                   shape=[None, None],
                                                   name='passage_char_lengths')

        query_char_states = self.build_char_states(char_embed=char_embed,
                                                   is_training=is_training,
                                                   reuse=False,
                                                   char_ids=self.query_char_ids,
                                                   char_lengths=self.query_char_lengths)

        passage_char_states = self.build_char_states(char_embed=char_embed,
                                                     is_training=is_training,
                                                     reuse=True,
                                                     char_ids=self.passage_char_ids,
                                                     char_lengths=self.passage_char_lengths)

        with tf.variable_scope("encoding") as scope:
            query_states = tf.concat([tf.nn.embedding_lookup(
                word_embed, self.query_word), query_char_states], axis=2)
            scope.reuse_variables()
            passage_states = tf.concat([tf.nn.embedding_lookup(
                word_embed, self.passage_word), passage_char_states], axis=2)
        passage_states = tf.transpose(passage_states, perm=[1, 0, 2])
        query_states = tf.transpose(query_states, perm=[1, 0, 2])
        self.passage_states = passage_states
        self.query_states = query_states

        output, output2 = graph_to_network(passage_states, query_states,
                                           self.passage_lengths, self.query_lengths,
                                           self.graph, self.cfg.dropout,
                                           is_training, num_heads=cfg.num_heads,
                                           rnn_units=cfg.rnn_units)

        passage_att_mask = self.passage_mask
        batch_size_x = tf.shape(self.query_lengths)
        answer_h = tf.zeros(
            tf.concat([batch_size_x, tf.constant([cfg.ptr_dim], dtype=tf.int32)], axis=0))

        answer_context = tf.reduce_mean(output2, axis=1)

        query_init_w = tf.get_variable(
            'query_init_w', shape=[output2.get_shape().as_list()[-1], cfg.ptr_dim])
        self.query_init = query_init_w
        answer_context = tf.matmul(answer_context, query_init_w)

        output = tf.transpose(output, perm=[1, 0, 2])

        with tf.variable_scope('answer_ptr_layer'):
            ptr_att = DotAttention('ptr',
                                   hidden_dim=cfg.ptr_dim,
                                   is_vanilla=self.cfg.att_is_vanilla,
                                   is_identity_transform=self.cfg.att_is_id,
                                   need_padding=self.cfg.att_need_padding)
            answer_pre_compute = ptr_att.get_pre_compute(output)
            ptr_gru = XGRUCell(hidden_dim=cfg.ptr_dim)
            begin_prob, begin_logits = ptr_att.get_prob(output, answer_context, passage_att_mask,
                                                        answer_pre_compute, True)
            att_state = ptr_att.get_att(output, begin_prob)
            (_, answer_h) = ptr_gru.call(inputs=att_state, state=answer_h)
            answer_context = answer_h
            end_prob, end_logits = ptr_att.get_prob(output, answer_context,
                                                    passage_att_mask, answer_pre_compute,
                                                    True)

        self.begin_prob = tf.transpose(begin_prob, perm=[1, 0])
        self.end_prob = tf.transpose(end_prob, perm=[1, 0])
        begin_logits = tf.transpose(begin_logits, perm=[1, 0])
        end_logits = tf.transpose(end_logits, perm=[1, 0])

        if is_training:
            def label_smoothing(inputs, masks, epsilon=0.1):
                epsilon = cfg.labelsmoothing
                num_of_channel = tf.shape(inputs)[-1]  # number of channels
                inputs = tf.cast(inputs, tf.float32)
                return (((1 - epsilon) * inputs) + (epsilon /
                                                    tf.cast(num_of_channel, tf.float32))) * masks
            cost1 = tf.reduce_mean(tf.losses.softmax_cross_entropy(label_smoothing(
                tf.one_hot(self.answer_begin,
                           depth=tf.shape(self.passage_word)[0]),
                tf.transpose(self.passage_mask, perm=[1, 0])), begin_logits))
            cost2 = tf.reduce_mean(tf.losses.softmax_cross_entropy(
                label_smoothing(tf.one_hot(self.answer_end,
                                           depth=tf.shape(self.passage_word)[0]),
                                tf.transpose(self.passage_mask, perm=[1, 0])), end_logits))

            reg_ws = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.reduce_sum(reg_ws)
            loss = cost1 + cost2 + l2_loss
            self.loss = loss

            optimizer = tf.train.AdamOptimizer(learning_rate=cfg.learning_rate)
            self.train_op = optimizer.minimize(self.loss)

        return tf.stack([self.begin_prob, self.end_prob])

    def build_char_states(self, char_embed, is_training, reuse, char_ids, char_lengths):
        max_char_length = self.cfg.max_char_length

        inputs = dropout(tf.nn.embedding_lookup(char_embed, char_ids),
                         self.cfg.dropout, is_training)
        inputs = tf.reshape(
            inputs, shape=[max_char_length, -1, self.cfg.char_embed_dim])
        char_lengths = tf.reshape(char_lengths, shape=[-1])
        with tf.variable_scope('char_encoding', reuse=reuse):
            cell_fw = XGRUCell(hidden_dim=self.cfg.char_embed_dim)
            cell_bw = XGRUCell(hidden_dim=self.cfg.char_embed_dim)
            _, (left_right, right_left) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                sequence_length=char_lengths,
                inputs=inputs,
                time_major=True,
                dtype=tf.float32
            )

        left_right = tf.reshape(left_right, shape=[-1, self.cfg.char_embed_dim])
        right_left = tf.reshape(right_left, shape=[-1, self.cfg.char_embed_dim])

        states = tf.concat([left_right, right_left], axis=1)
        out_shape = tf.shape(char_ids)[1:3]
        out_shape = tf.concat([out_shape, tf.constant(
            value=[self.cfg.char_embed_dim * 2], dtype=tf.int32)], axis=0)
        return tf.reshape(states, shape=out_shape)
examples/trials/ga_squad/trial.py  0 → 100644
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import os
import logging

logger = logging.getLogger('ga_squad')

try:
    import argparse
    import heapq
    import json
    import numpy as np
    import pickle

    import graph
    from util import Timer

    import nni
    import data
    import evaluate
    from train_model import *

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
except:
    logger.exception('Catch exception in trial.py.')
    raise


def get_config():
    '''
    Get config from argument parser.
    '''
    parser = argparse.ArgumentParser(
        description='This program is using genetic algorithm to search architecture for SQuAD.')
    parser.add_argument('--input_file', type=str,
                        default='./dev-v1.1.json', help='input file')
    parser.add_argument('--dev_file', type=str,
                        default='./dev-v1.1.json', help='dev file')
    parser.add_argument('--embedding_file', type=str,
                        default='./glove.840B.300d.txt', help='dev file')
    parser.add_argument('--root_path', default='./data/',
                        type=str, help='Root path of models')
    parser.add_argument('--batch_size', type=int, default=2, help='batch size')
    parser.add_argument('--save_path', type=str,
                        default='./save', help='save path dir')
    parser.add_argument('--learning_rate', type=float, default=0.0001,
                        help='set half of original learning rate reload data and train.')
    parser.add_argument('--max_epoch', type=int, default=30)
    parser.add_argument('--dropout_rate', type=float,
                        default=0.1, help='dropout_rate')
    parser.add_argument('--labelsmoothing', type=float,
                        default=0.1, help='labelsmoothing')
    parser.add_argument('--num_heads', type=int, default=1, help='num_heads')
    parser.add_argument('--rnn_units', type=int, default=256, help='rnn_units')

    args = parser.parse_args()

    return args


def get_id(word_dict, word):
    '''
    Return word id.
    '''
    if word in word_dict.keys():
        return word_dict[word]
    return word_dict['<unk>']


def load_embedding(path):
    '''
    Return the embedding for a specific file given its path.
    '''
    embedding_dict = {}
    with open(path, 'r', encoding='utf-8') as file:
        pairs = [line.strip('\r\n').split() for line in file.readlines()]
        for pair in pairs:
            embedding_dict[pair[0]] = [float(x) for x in pair[1:]]
    logger.debug('embedding_dict size: %d', len(embedding_dict))
    return embedding_dict


class MaxQueue:
    '''
    Queue for max value.
    '''
    def __init__(self, capacity):
        assert capacity > 0, 'queue size must be larger than 0'
        self._capacity = capacity
        self._entries = []

    @property
    def entries(self):
        return self._entries

    @property
    def capacity(self):
        return self._capacity

    @property
    def size(self):
        return len(self._entries)

    def clear(self):
        self._entries = []

    def push(self, item):
        if self.size < self.capacity:
            heapq.heappush(self.entries, item)
        else:
            heapq.heappushpop(self.entries, item)


def find_best_answer_span(left_prob, right_prob, passage_length, max_answer_length):
    left = 0
    right = 0
    max_prob = left_prob[0] * right_prob[0]
    for i in range(0, passage_length):
        left_p = left_prob[i]
        for j in range(i, min(i + max_answer_length, passage_length)):
            total_prob = left_p * right_prob[j]
            if max_prob < total_prob:
                left, right, max_prob = i, j, total_prob
    return [(max_prob, left, right)]


def write_prediction(path, position1_result, position2_result):
    import codecs

    with codecs.open(path, 'w', encoding='utf8') as file:
        batch_num = len(position1_result)
        for i in range(batch_num):
            position1_batch = position1_result[i]
            position2_batch = position2_result[i]
            for j in range(position1_batch.shape[0]):
                file.write(str(position1_batch[j]) + '\t' + str(position2_batch[j]) + '\n')


def find_kbest_answer_span(k, left_prob, right_prob, passage_length, max_answer_length):
    if k == 1:
        return find_best_answer_span(left_prob, right_prob, passage_length, max_answer_length)

    queue = MaxQueue(k)
    for i in range(0, passage_length):
        left_p = left_prob[i]
        for j in range(i, min(i + max_answer_length, passage_length)):
            total_prob = left_p * right_prob[j]
            queue.push((total_prob, i, j))
    return list(sorted(queue.entries, key=lambda x: -x[0]))


def run_epoch(batches, answer_net, is_training):
    if not is_training:
        position1_result = []
        position2_result = []
        contexts = []
        ids = []

    loss_sum = 0
    timer = Timer()
    count = 0
    for batch in batches:
        used = timer.get_elapsed(False)
        count += 1
        qps = batch['qp_pairs']
        question_tokens = [qp['question_tokens'] for qp in qps]
        passage_tokens = [qp['passage_tokens'] for qp in qps]
        context = [(qp['passage'], qp['passage_tokens']) for qp in qps]
        sample_id = [qp['id'] for qp in qps]

        _, query, query_mask, query_lengths = data.get_word_input(
            data=question_tokens, word_dict=word_vcb, embed=embed, embed_dim=cfg.word_embed_dim)
        _, passage, passage_mask, passage_lengths = data.get_word_input(
            data=passage_tokens, word_dict=word_vcb, embed=embed, embed_dim=cfg.word_embed_dim)

        query_char, query_char_lengths = data.get_char_input(
            data=question_tokens, char_dict=char_vcb, max_char_length=cfg.max_char_length)

        passage_char, passage_char_lengths = data.get_char_input(
            data=passage_tokens, char_dict=char_vcb, max_char_length=cfg.max_char_length)

        if is_training:
            answer_begin, answer_end = data.get_answer_begin_end(qps)

        if is_training:
            feed_dict = {answer_net.query_word: query,
                         answer_net.query_mask: query_mask,
                         answer_net.query_lengths: query_lengths,
                         answer_net.passage_word: passage,
                         answer_net.passage_mask: passage_mask,
                         answer_net.passage_lengths: passage_lengths,
                         answer_net.query_char_ids: query_char,
                         answer_net.query_char_lengths: query_char_lengths,
                         answer_net.passage_char_ids: passage_char,
                         answer_net.passage_char_lengths: passage_char_lengths,
                         answer_net.answer_begin: answer_begin,
                         answer_net.answer_end: answer_end}
            loss, _, = sess.run(
                [answer_net.loss, answer_net.train_op], feed_dict=feed_dict)
            if count % 100 == 0:
                logger.debug('%d %g except:%g, loss:%g' %
                             (count, used, used / count * len(batches), loss))
            loss_sum += loss
        else:
            feed_dict = {answer_net.query_word: query,
                         answer_net.query_mask: query_mask,
                         answer_net.query_lengths: query_lengths,
                         answer_net.passage_word: passage,
                         answer_net.passage_mask: passage_mask,
                         answer_net.passage_lengths: passage_lengths,
                         answer_net.query_char_ids: query_char,
                         answer_net.query_char_lengths: query_char_lengths,
                         answer_net.passage_char_ids: passage_char,
                         answer_net.passage_char_lengths: passage_char_lengths}
            position1, position2 = sess.run(
                [answer_net.begin_prob, answer_net.end_prob], feed_dict=feed_dict)
            position1_result += position1.tolist()
            position2_result += position2.tolist()
            contexts += context
            ids = np.concatenate((ids, sample_id))
            if count % 100 == 0:
                logger.debug('%d %g except:%g' %
                             (count, used, used / count * len(batches)))
        if count % 100 == 0:
            break
    loss = loss_sum / len(batches)
    if is_training:
        return loss
    return loss, position1_result, position2_result, ids, contexts


def generate_predict_json(position1_result, position2_result, ids, passage_tokens):
    '''
    Generate json by prediction.
    '''
    predict_len = len(position1_result)
    logger.debug('total prediction num is %s', str(predict_len))

    answers = {}
    for i in range(predict_len):
        sample_id = ids[i]
        passage, tokens = passage_tokens[i]
        kbest = find_best_answer_span(
            position1_result[i], position2_result[i], len(tokens), 23)
        _, start, end = kbest[0]
        answer = passage[tokens[start]['char_begin']:tokens[end]['char_end']]
        answers[sample_id] = answer
    logger.debug('generate predict done.')
    return answers


def generate_data(path, tokenizer, char_vcb, word_vcb, is_training=False):
    '''
    Generate data
    '''
    global root_path
    qp_pairs = data.load_from_file(path=path, is_training=is_training)

    tokenized_sent = 0
    # qp_pairs = qp_pairs[:1000]1
    for qp_pair in qp_pairs:
        tokenized_sent += 1
        data.tokenize(qp_pair, tokenizer, is_training)
        for word in qp_pair['question_tokens']:
            word_vcb.add(word['word'])
            for char in word['word']:
                char_vcb.add(char)
        for word in qp_pair['passage_tokens']:
            word_vcb.add(word['word'])
            for char in word['word']:
                char_vcb.add(char)

    max_query_length = max(len(x['question_tokens']) for x in qp_pairs)
    max_passage_length = max(len(x['passage_tokens']) for x in qp_pairs)
    #min_passage_length = min(len(x['passage_tokens']) for x in qp_pairs)
    cfg.max_query_length = max_query_length
    cfg.max_passage_length = max_passage_length

    return qp_pairs


def train_with_graph(graph, qp_pairs, dev_qp_pairs):
    '''
    Train a network from a specific graph.
    '''
    global sess
    with tf.Graph().as_default():
        train_model = GAG(cfg, embed, graph)
        train_model.build_net(is_training=True)
        tf.get_variable_scope().reuse_variables()
        dev_model = GAG(cfg, embed, graph)
        dev_model.build_net(is_training=False)
        with tf.Session() as sess:
            logger.debug('init variables')
            init = tf.global_variables_initializer()
            sess.run(init)
            # writer = tf.summary.FileWriter('%s/graph/'%execution_path, sess.graph)
            logger.debug('assign to graph')

            saver = tf.train.Saver()
            train_loss = None
            bestacc = 0
            patience = 5
            patience_increase = 2
            improvement_threshold = 0.995

            for epoch in range(max_epoch):
                logger.debug('begin to train')
                train_batches = data.get_batches(qp_pairs, cfg.batch_size)
                train_loss = run_epoch(train_batches, train_model, True)
                logger.debug('epoch ' + str(epoch) + ' loss: ' + str(train_loss))
                dev_batches = list(data.get_batches(
                    dev_qp_pairs, cfg.batch_size))
                _, position1, position2, ids, contexts = run_epoch(
                    dev_batches, dev_model, False)

                answers = generate_predict_json(
                    position1, position2, ids, contexts)
                if save_path is not None:
                    with open(save_path + 'epoch%d.prediction' % epoch, 'w') as file:
                        json.dump(answers, file)
                else:
                    answers = json.dumps(answers)
                    answers = json.loads(answers)
                iter = epoch + 1

                acc = evaluate.evaluate_with_predictions(
                    args.dev_file, answers)

                logger.debug('Send intermediate acc: %s', str(acc))
                nni.report_intermediate_result(acc)

                logger.debug('Send intermediate result done.')

                if acc > bestacc:
                    if acc * improvement_threshold > bestacc:
                        patience = max(patience, iter * patience_increase)
                    bestacc = acc

                    if save_path is not None:
                        saver.save(sess, save_path + 'epoch%d.model' % epoch)
                        with open(save_path + 'epoch%d.score' % epoch, 'wb') as file:
                            pickle.dump(
                                (position1, position2, ids, contexts), file)
                logger.debug('epoch %d acc %g bestacc %g' % (epoch, acc, bestacc))
                if patience <= iter:
                    break
            logger.debug('save done.')
    return train_loss, bestacc


embed = None
char_vcb = None
tokenizer = None
word_vcb = None


def load_data():
    global embed, char_vcb, tokenizer, word_vcb
    logger.debug('tokenize data')
    tokenizer = data.WhitespaceTokenizer()

    char_set = set()
    word_set = set()
    logger.debug('generate train data')
    qp_pairs = generate_data(input_file, tokenizer,
                             char_set, word_set, is_training=True)
    logger.debug('generate dev data')
    dev_qp_pairs = generate_data(
        dev_file, tokenizer, char_set, word_set, is_training=False)
    logger.debug('generate data done.')

    char_vcb = {char: sample_id for sample_id, char in enumerate(char_set)}
    word_vcb = {word: sample_id for sample_id, word in enumerate(word_set)}

    timer.start()
    logger.debug('read embedding table')

    cfg.word_embed_dim = 300
    embed = np.zeros((len(word_vcb), cfg.word_embed_dim), dtype=np.float32)

    embedding = load_embedding(args.embedding_file)
    for word, sample_id in enumerate(word_vcb):
        if word in embedding:
            embed[sample_id] = embedding[word]

    # add UNK into dict
    unk = np.zeros((1, cfg.word_embed_dim), dtype=np.float32)
    embed = np.concatenate((unk, embed), axis=0)
    word_vcb = {key: value + 1 for key, value in word_vcb.items()}

    return qp_pairs, dev_qp_pairs


if __name__ == '__main__':
    try:
        args = get_config()

        root_path = os.path.expanduser(args.root_path)
        input_file = os.path.expanduser(args.input_file)
        dev_file = os.path.expanduser(args.dev_file)
        save_path = None
        max_epoch = args.max_epoch

        cfg = GAGConfig()
        cfg.batch_size = args.batch_size
        cfg.learning_rate = float(args.learning_rate)
        cfg.dropout = args.dropout_rate
        cfg.rnn_units = args.rnn_units
        cfg.labelsmoothing = args.labelsmoothing
        cfg.num_heads = args.num_heads
        timer = Timer()

        qp_pairs, dev_qp_pairs = load_data()
        logger.debug('Init finish.')

        original_params = nni.get_parameters()
        '''
        with open('data.json') as f:
            original_params = json.load(f)
        '''
        try:
            graph = graph.graph_loads(original_params)
        except Exception:
            logger.debug('Can\'t load graph.')
        train_loss, best_acc = train_with_graph(graph, qp_pairs, dev_qp_pairs)

        logger.debug('Send best acc: %s', str(best_acc))
        nni.report_final_result(best_acc)
        logger.debug('Send final result done')
    except:
        logger.exception('Catch exception in trial.py.')
        raise
examples/trials/ga_squad/util.py  0 → 100644
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
'''
Util Module
'''
import time

import tensorflow as tf


def shape(tensor):
    '''
    Get the static shape of a tensor.
    Return type is tuple.
    '''
    temp_s = tensor.get_shape()
    return tuple([temp_s[i].value for i in range(0, len(temp_s))])


def get_variable(name, temp_s):
    '''
    Create a zero-initialized variable with the given name and shape.
    '''
    return tf.Variable(tf.zeros(temp_s), name=name)


def dropout(tensor, drop_prob, is_training):
    '''
    Apply dropout during training; return the tensor unchanged otherwise.
    '''
    if not is_training:
        return tensor
    return tf.nn.dropout(tensor, 1.0 - drop_prob)


class Timer:
    '''
    Timer measures elapsed wall-clock time.
    '''
    def __init__(self):
        self.__start = time.time()

    def start(self):
        '''
        Reset the timer.
        '''
        self.__start = time.time()

    def get_elapsed(self, restart=True):
        '''
        Return the elapsed time span, optionally restarting the timer.
        '''
        end = time.time()
        span = end - self.__start
        if restart:
            self.__start = end
        return span
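Note: the helpers above (shape, dropout, Timer) are shared by the GA-SQuAD trial code. A minimal usage sketch, assuming util.py is importable from the trial's working directory and using an illustrative tensor shape (TensorFlow 1.x style, matching the rest of this example):

import tensorflow as tf

from util import Timer, dropout, shape

timer = Timer()
activations = tf.zeros([32, 128])               # illustrative activation tensor
print(shape(activations))                       # static shape as a tuple: (32, 128)
dropped = dropout(activations, drop_prob=0.2, is_training=True)   # keep_prob = 0.8
print('graph construction took %.3fs' % timer.get_elapsed())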
examples/trials/mnist-annotation/config.yml
0 → 100644
View file @
252f36f8
authorName: default
experimentName: example_mnist
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 1
#choice: local, remote
trainingServicePlatform: local
#choice: true, false
useAnnotation: true
tuner:
  #choice: TPE, Random, Anneal, Evolution
  tunerName: TPE
  #choice: Maximize, Minimize
  optimizationMode: Maximize
trial:
  trialCommand: python3 mnist.py
  trialCodeDir: /usr/share/nni/examples/trials/mnist-annotation
  trialGpuNum: 0
\ No newline at end of file
examples/trials/mnist-annotation/mnist.py
0 → 100644
View file @
252f36f8
"""A deep MNIST classifier using convolutional layers."""
import
logging
import
math
import
tempfile
import
tensorflow
as
tf
from
tensorflow.examples.tutorials.mnist
import
input_data
FLAGS
=
None
logger
=
logging
.
getLogger
(
'mnist_AutoML'
)
class
MnistNetwork
(
object
):
'''
MnistNetwork is for initlizing and building basic network for mnist.
'''
def
__init__
(
self
,
channel_1_num
,
channel_2_num
,
conv_size
,
hidden_size
,
pool_size
,
learning_rate
,
x_dim
=
784
,
y_dim
=
10
):
self
.
channel_1_num
=
channel_1_num
self
.
channel_2_num
=
channel_2_num
"""@nni.variable(nni.choice(2, 3, 5, 7),name=self.conv_size)"""
self
.
conv_size
=
conv_size
"""@nni.variable(nni.choice(124, 512, 1024), name=self.hidden_size)"""
self
.
hidden_size
=
hidden_size
self
.
pool_size
=
pool_size
"""@nni.variable(nni.uniform(0.0001, 0.1), name=self.learning_rate)"""
self
.
learning_rate
=
learning_rate
self
.
x_dim
=
x_dim
self
.
y_dim
=
y_dim
self
.
images
=
tf
.
placeholder
(
tf
.
float32
,
[
None
,
self
.
x_dim
],
name
=
'input_x'
)
self
.
labels
=
tf
.
placeholder
(
tf
.
float32
,
[
None
,
self
.
y_dim
],
name
=
'input_y'
)
self
.
keep_prob
=
tf
.
placeholder
(
tf
.
float32
,
name
=
'keep_prob'
)
self
.
train_step
=
None
self
.
accuracy
=
None
def
build_network
(
self
):
'''
Building network for mnist
'''
# Reshape to use within a convolutional neural net.
# Last dimension is for "features" - there is only one here, since images are
# grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
with
tf
.
name_scope
(
'reshape'
):
try
:
input_dim
=
int
(
math
.
sqrt
(
self
.
x_dim
))
except
:
print
(
'input dim cannot be sqrt and reshape. input dim: '
+
str
(
self
.
x_dim
))
logger
.
debug
(
'input dim cannot be sqrt and reshape. input dim: %s'
,
str
(
self
.
x_dim
))
raise
x_image
=
tf
.
reshape
(
self
.
images
,
[
-
1
,
input_dim
,
input_dim
,
1
])
# First convolutional layer - maps one grayscale image to 32 feature maps.
with
tf
.
name_scope
(
'conv1'
):
w_conv1
=
weight_variable
(
[
self
.
conv_size
,
self
.
conv_size
,
1
,
self
.
channel_1_num
])
b_conv1
=
bias_variable
([
self
.
channel_1_num
])
"""@nni.function_choice(tf.nn.relu(conv2d(x_image, w_conv1) + b_conv1), tf.nn.sigmoid(conv2d(x_image, w_conv1) + b_conv1), tf.nn.tanh(conv2d(x_image, w_conv1) + b_conv1), name=tf.nn.relu)"""
h_conv1
=
tf
.
nn
.
relu
(
conv2d
(
x_image
,
w_conv1
)
+
b_conv1
)
# Pooling layer - downsamples by 2X.
with
tf
.
name_scope
(
'pool1'
):
"""@nni.function_choice(max_pool(h_conv1, self.pool_size), avg_pool(h_conv1, self.pool_size), name=max_pool)"""
h_pool1
=
max_pool
(
h_conv1
,
self
.
pool_size
)
# Second convolutional layer -- maps 32 feature maps to 64.
with
tf
.
name_scope
(
'conv2'
):
w_conv2
=
weight_variable
([
self
.
conv_size
,
self
.
conv_size
,
self
.
channel_1_num
,
self
.
channel_2_num
])
b_conv2
=
bias_variable
([
self
.
channel_2_num
])
h_conv2
=
tf
.
nn
.
relu
(
conv2d
(
h_pool1
,
w_conv2
)
+
b_conv2
)
# Second pooling layer.
with
tf
.
name_scope
(
'pool2'
):
h_pool2
=
max_pool
(
h_conv2
,
self
.
pool_size
)
# Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image
# is down to 7x7x64 feature maps -- maps this to 1024 features.
last_dim
=
int
(
input_dim
/
(
self
.
pool_size
*
self
.
pool_size
))
with
tf
.
name_scope
(
'fc1'
):
w_fc1
=
weight_variable
(
[
last_dim
*
last_dim
*
self
.
channel_2_num
,
self
.
hidden_size
])
b_fc1
=
bias_variable
([
self
.
hidden_size
])
h_pool2_flat
=
tf
.
reshape
(
h_pool2
,
[
-
1
,
last_dim
*
last_dim
*
self
.
channel_2_num
])
h_fc1
=
tf
.
nn
.
relu
(
tf
.
matmul
(
h_pool2_flat
,
w_fc1
)
+
b_fc1
)
# Dropout - controls the complexity of the model, prevents co-adaptation of features.
with
tf
.
name_scope
(
'dropout'
):
h_fc1_drop
=
tf
.
nn
.
dropout
(
h_fc1
,
self
.
keep_prob
)
# Map the 1024 features to 10 classes, one for each digit
with
tf
.
name_scope
(
'fc2'
):
w_fc2
=
weight_variable
([
self
.
hidden_size
,
self
.
y_dim
])
b_fc2
=
bias_variable
([
self
.
y_dim
])
y_conv
=
tf
.
matmul
(
h_fc1_drop
,
w_fc2
)
+
b_fc2
with
tf
.
name_scope
(
'loss'
):
cross_entropy
=
tf
.
reduce_mean
(
tf
.
nn
.
softmax_cross_entropy_with_logits
(
labels
=
self
.
labels
,
logits
=
y_conv
))
with
tf
.
name_scope
(
'adam_optimizer'
):
self
.
train_step
=
tf
.
train
.
AdamOptimizer
(
self
.
learning_rate
).
minimize
(
cross_entropy
)
with
tf
.
name_scope
(
'accuracy'
):
correct_prediction
=
tf
.
equal
(
tf
.
argmax
(
y_conv
,
1
),
tf
.
argmax
(
self
.
labels
,
1
))
self
.
accuracy
=
tf
.
reduce_mean
(
tf
.
cast
(
correct_prediction
,
tf
.
float32
))
def
conv2d
(
x_input
,
w_matrix
):
"""conv2d returns a 2d convolution layer with full stride."""
return
tf
.
nn
.
conv2d
(
x_input
,
w_matrix
,
strides
=
[
1
,
1
,
1
,
1
],
padding
=
'SAME'
)
def
max_pool
(
x_input
,
pool_size
):
"""max_pool downsamples a feature map by 2X."""
return
tf
.
nn
.
max_pool
(
x_input
,
ksize
=
[
1
,
pool_size
,
pool_size
,
1
],
strides
=
[
1
,
pool_size
,
pool_size
,
1
],
padding
=
'SAME'
)
def
avg_pool
(
x_input
,
pool_size
):
return
tf
.
nn
.
avg_pool
(
x_input
,
ksize
=
[
1
,
pool_size
,
pool_size
,
1
],
strides
=
[
1
,
pool_size
,
pool_size
,
1
],
padding
=
'SAME'
)
def
weight_variable
(
shape
):
"""weight_variable generates a weight variable of a given shape."""
initial
=
tf
.
truncated_normal
(
shape
,
stddev
=
0.1
)
return
tf
.
Variable
(
initial
)
def
bias_variable
(
shape
):
"""bias_variable generates a bias variable of a given shape."""
initial
=
tf
.
constant
(
0.1
,
shape
=
shape
)
return
tf
.
Variable
(
initial
)
def
main
(
params
):
'''
Main function, build mnist network, run and send result to NNI.
'''
# Import data
mnist
=
input_data
.
read_data_sets
(
params
[
'data_dir'
],
one_hot
=
True
)
print
(
'Mnist download data down.'
)
logger
.
debug
(
'Mnist download data down.'
)
# Create the model
# Build the graph for the deep net
mnist_network
=
MnistNetwork
(
channel_1_num
=
params
[
'channel_1_num'
],
channel_2_num
=
params
[
'channel_2_num'
],
conv_size
=
params
[
'conv_size'
],
hidden_size
=
params
[
'hidden_size'
],
pool_size
=
params
[
'pool_size'
],
learning_rate
=
params
[
'learning_rate'
])
mnist_network
.
build_network
()
logger
.
debug
(
'Mnist build network done.'
)
# Write log
graph_location
=
tempfile
.
mkdtemp
()
logger
.
debug
(
'Saving graph to: %s'
,
graph_location
)
train_writer
=
tf
.
summary
.
FileWriter
(
graph_location
)
train_writer
.
add_graph
(
tf
.
get_default_graph
())
test_acc
=
0.0
with
tf
.
Session
()
as
sess
:
sess
.
run
(
tf
.
global_variables_initializer
())
"""@nni.variable(nni.choice(50, 250, 500), name=batch_num)"""
batch_num
=
params
[
'batch_num'
]
for
i
in
range
(
batch_num
):
batch
=
mnist
.
train
.
next_batch
(
batch_num
)
"""@nni.variable(nni.choice(1, 5), name=dropout_rate)"""
dropout_rate
=
params
[
'dropout_rate'
]
mnist_network
.
train_step
.
run
(
feed_dict
=
{
mnist_network
.
images
:
batch
[
0
],
mnist_network
.
labels
:
batch
[
1
],
mnist_network
.
keep_prob
:
dropout_rate
}
)
if
i
%
100
==
0
:
test_acc
=
mnist_network
.
accuracy
.
eval
(
feed_dict
=
{
mnist_network
.
images
:
mnist
.
test
.
images
,
mnist_network
.
labels
:
mnist
.
test
.
labels
,
mnist_network
.
keep_prob
:
1.0
})
"""@nni.report_intermediate_result(test_acc)"""
logger
.
debug
(
'test accuracy %g'
,
test_acc
)
logger
.
debug
(
'Pipe send intermediate result done.'
)
test_acc
=
mnist_network
.
accuracy
.
eval
(
feed_dict
=
{
mnist_network
.
images
:
mnist
.
test
.
images
,
mnist_network
.
labels
:
mnist
.
test
.
labels
,
mnist_network
.
keep_prob
:
1.0
})
"""@nni.report_final_result(test_acc)"""
logger
.
debug
(
'Final result is %g'
,
test_acc
)
logger
.
debug
(
'Send final result done.'
)
def
generate_defualt_params
():
'''
Generate default parameters for mnist network.
'''
params
=
{
'data_dir'
:
'/tmp/tensorflow/mnist/input_data'
,
'dropout_rate'
:
0.5
,
'channel_1_num'
:
32
,
'channel_2_num'
:
64
,
'conv_size'
:
5
,
'pool_size'
:
2
,
'hidden_size'
:
1024
,
'learning_rate'
:
1e-4
,
'batch_num'
:
200
}
return
params
if
__name__
==
'__main__'
:
try
:
main
(
generate_defualt_params
())
except
Exception
as
exception
:
logger
.
exception
(
exception
)
raise
examples/trials/mnist-keras/config.yml
0 → 100644
View file @
252f36f8
authorName: default
experimentName: example_mnist-keras
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 1
#choice: local, remote
trainingServicePlatform: local
searchSpacePath: /usr/share/nni/examples/trials/mnist-keras/search_space.json
#choice: true, false
useAnnotation: false
tuner:
  #choice: TPE, Random, Anneal, Evolution
  tunerName: TPE
  #choice: Maximize, Minimize
  optimizationMode: Maximize
trial:
  trialCommand: python3 mnist-keras.py
  trialCodeDir: /usr/share/nni/examples/trials/mnist-keras
  trialGpuNum: 0
\ No newline at end of file
examples/trials/mnist-keras/mnist-keras.py
0 → 100644
View file @
252f36f8
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import argparse
import logging
import os

import keras
import numpy as np
from keras import backend as K
from keras.callbacks import TensorBoard
from keras.datasets import mnist
from keras.layers import Conv2D, Dense, Flatten, MaxPooling2D
from keras.models import Sequential

import nni

LOG = logging.getLogger('mnist_keras')
K.set_image_data_format('channels_last')
TENSORBOARD_DIR = os.environ['NNI_OUTPUT_DIR']

H, W = 28, 28
NUM_CLASSES = 10


def create_mnist_model(hyper_params, input_shape=(H, W, 1), num_classes=NUM_CLASSES):
    '''
    Create simple convolutional model
    '''
    layers = [
        Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(100, activation='relu'),
        Dense(num_classes, activation='softmax')
    ]

    model = Sequential(layers)

    if hyper_params['optimizer'] == 'Adam':
        optimizer = keras.optimizers.Adam(lr=hyper_params['learning_rate'])
    else:
        optimizer = keras.optimizers.SGD(lr=hyper_params['learning_rate'], momentum=0.9)
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=optimizer, metrics=['accuracy'])

    return model


def load_mnist_data(args):
    '''
    Load MNIST dataset
    '''
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    x_train = (np.expand_dims(x_train, -1).astype(np.float) / 255.)[:args.num_train]
    x_test = (np.expand_dims(x_test, -1).astype(np.float) / 255.)[:args.num_test]
    y_train = keras.utils.to_categorical(y_train, NUM_CLASSES)[:args.num_train]
    y_test = keras.utils.to_categorical(y_test, NUM_CLASSES)[:args.num_test]

    LOG.debug('x_train shape: %s', (x_train.shape,))
    LOG.debug('x_test shape: %s', (x_test.shape,))

    return x_train, y_train, x_test, y_test


class SendMetrics(keras.callbacks.Callback):
    '''
    Keras callback to send metrics to NNI framework
    '''
    def on_epoch_end(self, epoch, logs={}):
        '''
        Run on end of each epoch
        '''
        LOG.debug(logs)
        nni.report_intermediate_result(logs)


def train(args, params):
    '''
    Train model
    '''
    x_train, y_train, x_test, y_test = load_mnist_data(args)
    model = create_mnist_model(params)

    model.fit(x_train, y_train, batch_size=args.batch_size, epochs=args.epochs,
              verbose=1, validation_data=(x_test, y_test),
              callbacks=[SendMetrics(), TensorBoard(log_dir=TENSORBOARD_DIR)])

    _, acc = model.evaluate(x_test, y_test, verbose=0)
    LOG.debug('Final result is: %g', acc)
    nni.report_final_result(acc)


def generate_default_params():
    '''
    Generate default hyper parameters
    '''
    return {
        'optimizer': 'Adam',
        'learning_rate': 0.001
    }


if __name__ == '__main__':
    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--batch_size", type=int, default=200, help="batch size", required=False)
    PARSER.add_argument("--epochs", type=int, default=10, help="Train epochs", required=False)
    PARSER.add_argument("--num_train", type=int, default=60000,
                        help="Number of train samples to be used, maximum 60000", required=False)
    PARSER.add_argument("--num_test", type=int, default=10000,
                        help="Number of test samples to be used, maximum 10000", required=False)

    ARGS, UNKNOWN = PARSER.parse_known_args()

    try:
        # get parameters from tuner
        RECEIVED_PARAMS = nni.get_parameters()
        LOG.debug(RECEIVED_PARAMS)
        PARAMS = generate_default_params()
        PARAMS.update(RECEIVED_PARAMS)
        # train
        train(ARGS, PARAMS)
    except Exception as e:
        LOG.exception(e)
        raise
examples/trials/mnist-keras/search_space.json
0 → 100644
View file @
252f36f8
{
    "optimizer": {"_type": "choice", "_value": ["Adam", "SGD"]},
    "learning_rate": {"_type": "choice", "_value": [0.0001, 0.001, 0.002, 0.005, 0.01]}
}
examples/trials/mnist-smartparam/config.yml
0 → 100644
View file @
252f36f8
authorName: default
experimentName: example_mnist-smartparam
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 1
#choice: local, remote
trainingServicePlatform: local
#choice: true, false
useAnnotation: true
tuner:
  #choice: TPE, Random, Anneal, Evolution
  tunerName: TPE
  #choice: Maximize, Minimize
  optimizationMode: Maximize
trial:
  trialCommand: python3 mnist.py
  trialCodeDir: /usr/share/nni/examples/trials/mnist-smartparam
  trialGpuNum: 0
\ No newline at end of file
examples/trials/mnist-smartparam/mnist.py
0 → 100644
View file @
252f36f8
This diff is collapsed.
examples/trials/mnist/config.yml
0 → 100644
View file @
252f36f8
authorName: default
experimentName: example_mnist
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 1
#choice: local, remote
trainingServicePlatform: local
searchSpacePath: /usr/share/nni/examples/trials/mnist/search_space.json
#choice: true, false
useAnnotation: false
tuner:
  #choice: TPE, Random, Anneal, Evolution
  tunerName: TPE
  #choice: Maximize, Minimize
  optimizationMode: Maximize
trial:
  trialCommand: python3 mnist.py
  trialCodeDir: /usr/share/nni/examples/trials/mnist
  trialGpuNum: 0
\ No newline at end of file
examples/trials/mnist/mnist.py
0 → 100644
View file @
252f36f8
This diff is collapsed.
examples/trials/mnist/search_space.json
0 → 100644
View file @
252f36f8
{
    "dropout_rate": {"_type": "uniform", "_value": [0.1, 0.5]},
    "conv_size": {"_type": "choice", "_value": [2, 3, 5, 7]},
    "hidden_size": {"_type": "choice", "_value": [124, 512, 1024]},
    "learning_rate": {"_type": "uniform", "_value": [0.0001, 0.1]}
}
\ No newline at end of file
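With useAnnotation set to false in the corresponding config.yml, the tuner samples a value for each key in this search space and the trial fetches them at run time. A minimal sketch of that handshake, assuming hypothetical defaults and a placeholder metric (the nni calls mirror the mnist-keras trial shown earlier in this commit):

import nni

# Hypothetical defaults for the parameters declared in search_space.json.
params = {
    'dropout_rate': 0.5,
    'conv_size': 5,
    'hidden_size': 1024,
    'learning_rate': 1e-4,
}

# Tuner-chosen values arrive as a dict keyed by the search-space names.
params.update(nni.get_parameters())

# ... build and train the model with params here ...
test_acc = 0.0  # placeholder for the real evaluation result
nni.report_final_result(test_acc)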
examples/tuners/README.md
0 → 100644
View file @
252f36f8
This diff is collapsed.