Merge pull request #207 from microsoft/master

merge master

Merge pull request #207 from microsoft/master
merge master
c785655e · SparkSnail · GitHub · 9fae194a · d6b61e2f · c785655e
Unverified Commit c785655e authored Oct 21, 2019 by SparkSnail Committed by GitHub Oct 21, 2019
20 changed files
--- a/examples/model_compress/configure_example.yaml
+++ b/examples/model_compress/configure_example.yaml
+AGPruner: 
+  config:
+    -
+        start_epoch: 1
+        end_epoch: 10
+        frequency: 1
+        initial_sparsity: 0.05
+        final_sparsity: 0.8
+        op_type: 'default'
--- a/examples/model_compress/main_tf_pruner.py
+++ b/examples/model_compress/main_tf_pruner.py
+from nni.compression.tensorflow import AGP_Pruner
+import tensorflow as tf
+from tensorflow.examples.tutorials.mnist import input_data
+def weight_variable(shape):
+    return tf.Variable(tf.truncated_normal(shape, stddev = 0.1))
+def bias_variable(shape):
+    return tf.Variable(tf.constant(0.1, shape = shape))
+def conv2d(x_input, w_matrix):
+    return tf.nn.conv2d(x_input, w_matrix, strides = [ 1, 1, 1, 1 ], padding = 'SAME')
+def max_pool(x_input, pool_size):
+    size = [ 1, pool_size, pool_size, 1 ]
+    return tf.nn.max_pool(x_input, ksize = size, strides = size, padding = 'SAME')
+class Mnist:
+    def __init__(self):
+        images = tf.placeholder(tf.float32, [ None, 784 ], name = 'input_x')
+        labels = tf.placeholder(tf.float32, [ None, 10 ], name = 'input_y')
+        keep_prob = tf.placeholder(tf.float32, name='keep_prob')
+        self.images = images
+        self.labels = labels
+        self.keep_prob = keep_prob
+        self.train_step = None
+        self.accuracy = None
+        self.w1 = None
+        self.b1 = None
+        self.fcw1 = None
+        self.cross = None
+        with tf.name_scope('reshape'):
+            x_image = tf.reshape(images, [ -1, 28, 28, 1 ])
+        with tf.name_scope('conv1'):
+            w_conv1 = weight_variable([ 5, 5, 1, 32 ])
+            self.w1 = w_conv1
+            b_conv1 = bias_variable([ 32 ])
+            self.b1 = b_conv1
+            h_conv1 = tf.nn.relu(conv2d(x_image, w_conv1) + b_conv1)
+        with tf.name_scope('pool1'):
+            h_pool1 = max_pool(h_conv1, 2)
+        with tf.name_scope('conv2'):
+            w_conv2 = weight_variable([ 5, 5, 32, 64 ])
+            b_conv2 = bias_variable([ 64 ])
+            h_conv2 = tf.nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
+        with tf.name_scope('pool2'):
+            h_pool2 = max_pool(h_conv2, 2)
+        with tf.name_scope('fc1'):
+            w_fc1 = weight_variable([ 7 * 7 * 64, 1024 ])
+            self.fcw1 = w_fc1
+            b_fc1 = bias_variable([ 1024 ])
+        h_pool2_flat = tf.reshape(h_pool2, [ -1, 7 * 7 * 64 ])
+        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)
+        with tf.name_scope('dropout'):
+            h_fc1_drop = tf.nn.dropout(h_fc1, 0.5)
+        with tf.name_scope('fc2'):
+            w_fc2 = weight_variable([ 1024, 10 ])
+            b_fc2 = bias_variable([ 10 ])
+            y_conv = tf.matmul(h_fc1_drop, w_fc2) + b_fc2
+        with tf.name_scope('loss'):
+            cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = labels, logits = y_conv))
+            self.cross = cross_entropy
+        with tf.name_scope('adam_optimizer'):
+            self.train_step = tf.train.AdamOptimizer(0.0001).minimize(cross_entropy)
+        with tf.name_scope('accuracy'):
+            correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(labels, 1))
+            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+def main():
+    tf.set_random_seed(0)
+    data = input_data.read_data_sets('data', one_hot = True)
+    model = Mnist()
+    '''you can change this to SensitivityPruner to implement it
+    pruner = SensitivityPruner(configure_list)
+    '''
+    configure_list = [{
+                        'initial_sparsity': 0,
+                        'final_sparsity': 0.8,
+                        'start_epoch': 1,
+                        'end_epoch': 10,
+                        'frequency': 1,
+                        'op_type': 'default'
+                    }]
+    pruner = AGP_Pruner(configure_list)
+    # if you want to load from yaml file
+    # configure_file = nni.compressors.tf_compressor._nnimc_tf._tf_default_load_configure_file('configure_example.yaml','AGPruner')
+    # configure_list = configure_file.get('config',[])
+    # pruner.load_configure(configure_list)
+    # you can also handle it yourself and input an configure list in json
+    pruner(tf.get_default_graph())
+    # you can also use compress(model) or compress_default_graph() for tensorflow compressor
+    # pruner.compress(tf.get_default_graph())
+    with tf.Session() as sess:
+        sess.run(tf.global_variables_initializer())
+        for batch_idx in range(2000):
+            batch = data.train.next_batch(2000)
+            model.train_step.run(feed_dict = {
+                model.images: batch[0],
+                model.labels: batch[1],
+                model.keep_prob: 0.5
+            })
+            if batch_idx % 10 == 0:
+                test_acc = model.accuracy.eval(feed_dict = {
+                    model.images: data.test.images,
+                    model.labels: data.test.labels,
+                    model.keep_prob: 1.0
+                })
+                pruner.update_epoch(batch_idx / 10,sess)
+                print('test accuracy', test_acc)
+        test_acc = model.accuracy.eval(feed_dict = {
+            model.images: data.test.images,
+            model.labels: data.test.labels,
+            model.keep_prob: 1.0
+        })
+        print('final result is', test_acc)
+if __name__ == '__main__':
+    main()
--- a/examples/model_compress/main_tf_quantizer.py
+++ b/examples/model_compress/main_tf_quantizer.py
+from nni.compression.tensorflow import QAT_Quantizer
+import tensorflow as tf
+from tensorflow.examples.tutorials.mnist import input_data
+def weight_variable(shape):
+    return tf.Variable(tf.truncated_normal(shape, stddev = 0.1))
+def bias_variable(shape):
+    return tf.Variable(tf.constant(0.1, shape = shape))
+def conv2d(x_input, w_matrix):
+    return tf.nn.conv2d(x_input, w_matrix, strides = [ 1, 1, 1, 1 ], padding = 'SAME')
+def max_pool(x_input, pool_size):
+    size = [ 1, pool_size, pool_size, 1 ]
+    return tf.nn.max_pool(x_input, ksize = size, strides = size, padding = 'SAME')
+class Mnist:
+    def __init__(self):
+        images = tf.placeholder(tf.float32, [ None, 784 ], name = 'input_x')
+        labels = tf.placeholder(tf.float32, [ None, 10 ], name = 'input_y')
+        keep_prob = tf.placeholder(tf.float32, name='keep_prob')
+        self.images = images
+        self.labels = labels
+        self.keep_prob = keep_prob
+        self.train_step = None
+        self.accuracy = None
+        self.w1 = None
+        self.b1 = None
+        self.fcw1 = None
+        self.cross = None
+        with tf.name_scope('reshape'):
+            x_image = tf.reshape(images, [ -1, 28, 28, 1 ])
+        with tf.name_scope('conv1'):
+            w_conv1 = weight_variable([ 5, 5, 1, 32 ])
+            self.w1 = w_conv1
+            b_conv1 = bias_variable([ 32 ])
+            self.b1 = b_conv1
+            h_conv1 = tf.nn.relu(conv2d(x_image, w_conv1) + b_conv1)
+        with tf.name_scope('pool1'):
+            h_pool1 = max_pool(h_conv1, 2)
+        with tf.name_scope('conv2'):
+            w_conv2 = weight_variable([ 5, 5, 32, 64 ])
+            b_conv2 = bias_variable([ 64 ])
+            h_conv2 = tf.nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
+        with tf.name_scope('pool2'):
+            h_pool2 = max_pool(h_conv2, 2)
+        with tf.name_scope('fc1'):
+            w_fc1 = weight_variable([ 7 * 7 * 64, 1024 ])
+            self.fcw1 = w_fc1
+            b_fc1 = bias_variable([ 1024 ])
+        h_pool2_flat = tf.reshape(h_pool2, [ -1, 7 * 7 * 64 ])
+        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)
+        with tf.name_scope('dropout'):
+            h_fc1_drop = tf.nn.dropout(h_fc1, 0.5)
+        with tf.name_scope('fc2'):
+            w_fc2 = weight_variable([ 1024, 10 ])
+            b_fc2 = bias_variable([ 10 ])
+            y_conv = tf.matmul(h_fc1_drop, w_fc2) + b_fc2
+        with tf.name_scope('loss'):
+            cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = labels, logits = y_conv))
+            self.cross = cross_entropy
+        with tf.name_scope('adam_optimizer'):
+            self.train_step = tf.train.AdamOptimizer(0.0001).minimize(cross_entropy)
+        with tf.name_scope('accuracy'):
+            correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(labels, 1))
+            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+def main():
+    tf.set_random_seed(0)
+    data = input_data.read_data_sets('data', one_hot = True)
+    model = Mnist()
+    '''you can change this to DoReFaQuantizer to implement it
+    DoReFaQuantizer(configure_list).compress(tf.get_default_graph())
+    '''
+    configure_list = [{'q_bits':8, 'op_type':'default'}]
+    quantizer = QAT_Quantizer(configure_list)
+    quantizer(tf.get_default_graph())
+    # you can also use compress(model) or compress_default_graph()
+    # method like QATquantizer(q_bits = 8).compress_default_graph()
+    with tf.Session() as sess:
+        sess.run(tf.global_variables_initializer())
+        for batch_idx in range(2000):
+            batch = data.train.next_batch(2000)
+            model.train_step.run(feed_dict = {
+                model.images: batch[0],
+                model.labels: batch[1],
+                model.keep_prob: 0.5
+            })
+            if batch_idx % 10 == 0:
+                test_acc = model.accuracy.eval(feed_dict = {
+                    model.images: data.test.images,
+                    model.labels: data.test.labels,
+                    model.keep_prob: 1.0
+                })
+                print('test accuracy', test_acc)
+        test_acc = model.accuracy.eval(feed_dict = {
+            model.images: data.test.images,
+            model.labels: data.test.labels,
+            model.keep_prob: 1.0
+        })
+        print('final result is', test_acc)
+if __name__ == '__main__':
+    main()
--- a/examples/model_compress/main_torch_pruner.py
+++ b/examples/model_compress/main_torch_pruner.py
+from nni.compression.torch import AGP_Pruner
+import torch
+import torch.nn.functional as F
+from torchvision import datasets, transforms
+class Mnist(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv1 = torch.nn.Conv2d(1, 20, 5, 1)
+        self.conv2 = torch.nn.Conv2d(20, 50, 5, 1)
+        self.fc1 = torch.nn.Linear(4 * 4 * 50, 500)
+        self.fc2 = torch.nn.Linear(500, 10)
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = F.relu(self.conv2(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = x.view(-1, 4 * 4 * 50)
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x)
+        return F.log_softmax(x, dim = 1)
+def train(model, device, train_loader, optimizer):
+    model.train()
+    for batch_idx, (data, target) in enumerate(train_loader):
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        if batch_idx % 100 == 0:
+            print('{:2.0f}%  Loss {}'.format(100 * batch_idx / len(train_loader), loss.item()))
+def test(model, device, test_loader):
+    model.eval()
+    test_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for data, target in test_loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            test_loss += F.nll_loss(output, target, reduction = 'sum').item()
+            pred = output.argmax(dim = 1, keepdim = True)
+            correct += pred.eq(target.view_as(pred)).sum().item()
+    test_loss /= len(test_loader.dataset)
+    print('Loss: {}  Accuracy: {}%)\n'.format(
+        test_loss, 100 * correct / len(test_loader.dataset)))
+def main():
+    torch.manual_seed(0)
+    device = torch.device('cpu')
+    trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
+    train_loader = torch.utils.data.DataLoader(
+        datasets.MNIST('data', train = True, download = True, transform = trans),
+        batch_size = 64, shuffle = True)
+    test_loader = torch.utils.data.DataLoader(
+        datasets.MNIST('data', train = False, transform = trans),
+        batch_size = 1000, shuffle = True)
+    model = Mnist()
+    '''you can change this to SensitivityPruner to implement it
+    pruner = SensitivityPruner(configure_list)
+    '''
+    configure_list = [{
+                        'initial_sparsity': 0,
+                        'final_sparsity': 0.8,
+                        'start_epoch': 1,
+                        'end_epoch': 10,
+                        'frequency': 1,
+                        'op_type': 'default'
+                    }]
+    pruner = AGP_Pruner(configure_list)
+    pruner(model)
+    # you can also use compress(model) method
+    # like that pruner.compress(model)
+    optimizer = torch.optim.SGD(model.parameters(), lr = 0.01, momentum = 0.5)
+    for epoch in range(10):
+        print('# Epoch {} #'.format(epoch))
+        train(model, device, train_loader, optimizer)
+        test(model, device, test_loader)
+        pruner.update_epoch(epoch)
+if __name__ == '__main__':
+    main()
--- a/examples/model_compress/main_torch_quantizer.py
+++ b/examples/model_compress/main_torch_quantizer.py
+from nni.compression.torch import QAT_Quantizer
+import torch
+import torch.nn.functional as F
+from torchvision import datasets, transforms
+class Mnist(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv1 = torch.nn.Conv2d(1, 20, 5, 1)
+        self.conv2 = torch.nn.Conv2d(20, 50, 5, 1)
+        self.fc1 = torch.nn.Linear(4 * 4 * 50, 500)
+        self.fc2 = torch.nn.Linear(500, 10)
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = F.relu(self.conv2(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = x.view(-1, 4 * 4 * 50)
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x)
+        return F.log_softmax(x, dim = 1)
+def train(model, device, train_loader, optimizer):
+    model.train()
+    for batch_idx, (data, target) in enumerate(train_loader):
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        if batch_idx % 100 == 0:
+            print('{:2.0f}%  Loss {}'.format(100 * batch_idx / len(train_loader), loss.item()))
+def test(model, device, test_loader):
+    model.eval()
+    test_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for data, target in test_loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            test_loss += F.nll_loss(output, target, reduction = 'sum').item()
+            pred = output.argmax(dim = 1, keepdim = True)
+            correct += pred.eq(target.view_as(pred)).sum().item()
+    test_loss /= len(test_loader.dataset)
+    print('Loss: {}  Accuracy: {}%)\n'.format(
+        test_loss, 100 * correct / len(test_loader.dataset)))
+def main():
+    torch.manual_seed(0)
+    device = torch.device('cpu')
+    trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
+    train_loader = torch.utils.data.DataLoader(
+        datasets.MNIST('data', train = True, download = True, transform = trans),
+        batch_size = 64, shuffle = True)
+    test_loader = torch.utils.data.DataLoader(
+        datasets.MNIST('data', train = False, transform = trans),
+        batch_size = 1000, shuffle = True)
+    model = Mnist()
+    '''you can change this to DoReFaQuantizer to implement it
+    DoReFaQuantizer(configure_list).compress(model)
+    '''
+    configure_list = [{'q_bits':8, 'op_type':'default'}]
+    quantizer = QAT_Quantizer(configure_list)
+    quantizer(model)
+    # you can also use compress(model) method
+    # like thaht quantizer.compress(model)
+    optimizer = torch.optim.SGD(model.parameters(), lr = 0.01, momentum = 0.5)
+    for epoch in range(10):
+        print('# Epoch {} #'.format(epoch))
+        train(model, device, train_loader, optimizer)
+        test(model, device, test_loader)
+if __name__ == '__main__':
+    main()
--- a/examples/trials/mnist-nas/classic_mode/config_hpo.yml
+++ b/examples/trials/mnist-nas/classic_mode/config_hpo.yml
+authorName: default
+experimentName: example_mnist
+trialConcurrency: 1
+maxExecDuration: 1h
+maxTrialNum: 10
+#choice: local, remote, pai
+trainingServicePlatform: local
+#choice: true, false
+useAnnotation: true
+tuner:
+  builtinTunerName: TPE
+trial:
+  command: python3 mnist.py --batch_num 200
+  codeDir: .
+  gpuNum: 0
+  nasMode: classic_mode
--- a/examples/trials/mnist-nas/config_ppo.yml
+++ b/examples/trials/mnist-nas/config_ppo.yml
+authorName: NNI-example
+experimentName: example_mnist
+trialConcurrency: 1
+maxExecDuration: 100h
+maxTrialNum: 10000
+#choice: local, remote, pai
+trainingServicePlatform: local
+#choice: true, false
+useAnnotation: true
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner
+  #SMAC, PPO (SMAC and PPO should be installed through nnictl)
+  builtinTunerName: PPOTuner
+  classArgs:
+    optimize_mode: maximize
+trial:
+  command: python3 mnist.py
+  codeDir: .
+  gpuNum: 0
--- a/examples/trials/mnist-pytorch/mnist.py
+++ b/examples/trials/mnist-pytorch/mnist.py
@@ -5,6 +5,7 @@ This file is a modification of the official pytorch mnist example:
 https://github.com/pytorch/examples/blob/master/mnist/main.py
 """
+import os
 import argparse
 import logging
 import nni
@@ -84,15 +85,18 @@ def main(args):
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
+    data_dir = os.path.join(args['data_dir'], nni.get_trial_id())
    train_loader = torch.utils.data.DataLoader(
-        datasets.MNIST(args['data_dir'], train=True, download=True,
+        datasets.MNIST(data_dir, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args['batch_size'], shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
-        datasets.MNIST(args['data_dir'], train=False, transform=transforms.Compose([
+        datasets.MNIST(data_dir, train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])),

--- a/examples/trials/nas_cifar10/README.md
+++ b/examples/trials/nas_cifar10/README.md
@@ -2,7 +2,14 @@
 ===	
 Now we have a NAS example, [NNI-NAS-Example](https://github.com/Crysple/NNI-NAS-Example), contributed by our community, which runs in NNI using the NAS interface.	
+We have included its trial code in this folder, and provided example config files to show how to use PPO tuner to tune the trial code.
+> Download data
+- `cd data && . download.sh`
+- `tar xzf cifar-10-python.tar.gz && mv cifar-10-batches-py cifar10`
 Thanks to our lovely contributors. 	
 And welcome more and more people to join us!
\ No newline at end of file
--- a/examples/trials/nas_cifar10/config_pai_ppo.yml
+++ b/examples/trials/nas_cifar10/config_pai_ppo.yml
+authorName: Unknown
+experimentName: enas_macro
+trialConcurrency: 20
+maxExecDuration: 2400h
+maxTrialNum: 20000
+#choice: local, remote, pai
+trainingServicePlatform: pai
+#choice: true, false
+useAnnotation: true
+multiPhase: false
+versionCheck: false
+nniManagerIp: 0.0.0.0
+tuner:
+  builtinTunerName: PPOTuner
+  classArgs:
+    optimize_mode: maximize
+    trials_per_update: 60
+    epochs_per_update: 20
+    minibatch_size: 6
+trial:
+  command: sh ./macro_cifar10_pai.sh
+  codeDir: ./
+  gpuNum: 1
+  cpuNum: 1
+  memoryMB: 8196
+  image: msranni/nni:latest
+  virtualCluster: nni
+paiConfig:
+  userName: your_account
+  passWord: your_pwd
+  host: 0.0.0.0
--- a/examples/trials/nas_cifar10/config_ppo.yml
+++ b/examples/trials/nas_cifar10/config_ppo.yml
+authorName: Unknown
+experimentName: enas_macro
+trialConcurrency: 4
+maxExecDuration: 2400h
+maxTrialNum: 20000
+#choice: local, remote
+trainingServicePlatform: local
+#choice: true, false
+useAnnotation: true
+multiPhase: false
+tuner:
+  builtinTunerName: PPOTuner
+  classArgs:
+    optimize_mode: maximize
+    trials_per_update: 60
+    epochs_per_update: 12
+    minibatch_size: 10
+  #could use the No. 0 gpu for this tuner
+  #if want to specify multiple gpus, here is an example of specifying three gpus: 0,1,2
+  gpuIndices: 0
+trial:
+  command: sh ./macro_cifar10.sh
+  codeDir: ./
+  gpuNum: 1
--- a/examples/trials/nas_cifar10/data/download.sh
+++ b/examples/trials/nas_cifar10/data/download.sh
+wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
--- a/examples/trials/nas_cifar10/macro_cifar10.sh
+++ b/examples/trials/nas_cifar10/macro_cifar10.sh
+#!/bin/bash
+set -e
+export PYTHONPATH="$(pwd)"
+python3 src/cifar10/nni_child_cifar10.py \
+  --data_format="NCHW" \
+  --search_for="macro" \
+  --reset_output_dir \
+  --data_path="data/cifar10" \
+  --output_dir="outputs" \
+  --train_data_size=45000 \
+  --batch_size=100 \
+  --num_epochs=8 \
+  --log_every=50 \
+  --eval_every_epochs=1 \
+  --child_use_aux_heads \
+  --child_num_layers=12 \
+  --child_out_filters=36 \
+  --child_l2_reg=0.0002 \
+  --child_num_branches=6 \
+  --child_num_cell_layers=5 \
+  --child_keep_prob=0.50 \
+  --child_drop_path_keep_prob=0.60 \
+  --child_lr_cosine \
+  --child_lr_max=0.05 \
+  --child_lr_min=0.001 \
+  --child_lr_T_0=10 \
+  --child_lr_T_mul=2 \
+  --child_mode="subgraph" \
+  "$@"
--- a/examples/trials/nas_cifar10/macro_cifar10_pai.sh
+++ b/examples/trials/nas_cifar10/macro_cifar10_pai.sh
+#!/bin/bash
+set -e
+export PYTHONPATH="$(pwd)"
+python3 src/cifar10/nni_child_cifar10.py \
+  --data_format="NCHW" \
+  --search_for="macro" \
+  --reset_output_dir \
+  --data_path="data/cifar10" \
+  --output_dir="outputs" \
+  --train_data_size=45000 \
+  --batch_size=100 \
+  --num_epochs=30 \
+  --log_every=50 \
+  --eval_every_epochs=1 \
+  --child_use_aux_heads \
+  --child_num_layers=12 \
+  --child_out_filters=36 \
+  --child_l2_reg=0.0002 \
+  --child_num_branches=6 \
+  --child_num_cell_layers=5 \
+  --child_keep_prob=0.50 \
+  --child_drop_path_keep_prob=0.60 \
+  --child_lr_cosine \
+  --child_lr_max=0.05 \
+  --child_lr_min=0.001 \
+  --child_lr_T_0=10 \
+  --child_lr_T_mul=2 \
+  --child_mode="subgraph" \
+  "$@"
--- a/examples/trials/nas_cifar10/src/__init__.py
+++ b/examples/trials/nas_cifar10/src/__init__.py
--- a/examples/trials/nas_cifar10/src/cifar10/__init__.py
+++ b/examples/trials/nas_cifar10/src/cifar10/__init__.py
--- a/examples/trials/nas_cifar10/src/cifar10/data_utils.py
+++ b/examples/trials/nas_cifar10/src/cifar10/data_utils.py
+import os
+import sys
+import pickle
+import numpy as np
+import tensorflow as tf
+def _read_data(data_path, train_files):
+    """Reads CIFAR-10 format data. Always returns NHWC format.
+    Returns:
+        images: np tensor of size [N, H, W, C]
+        labels: np tensor of size [N]
+    """
+    images, labels = [], []
+    for file_name in train_files:
+        print(file_name)
+        full_name = os.path.join(data_path, file_name)
+        with open(full_name, "rb") as finp:
+            data = pickle.load(finp, encoding='latin1')
+            batch_images = data["data"].astype(np.float32) / 255.0
+            batch_labels = np.array(data["labels"], dtype=np.int32)
+            images.append(batch_images)
+            labels.append(batch_labels)
+    images = np.concatenate(images, axis=0)
+    labels = np.concatenate(labels, axis=0)
+    images = np.reshape(images, [-1, 3, 32, 32])
+    images = np.transpose(images, [0, 2, 3, 1])
+    return images, labels
+def read_data(data_path, num_valids=5000):
+    print("-" * 80)
+    print("Reading data")
+    images, labels = {}, {}
+    train_files = [
+        "data_batch_1",
+        "data_batch_2",
+        "data_batch_3",
+        "data_batch_4",
+        "data_batch_5",
+    ]
+    test_file = [
+        "test_batch",
+    ]
+    images["train"], labels["train"] = _read_data(data_path, train_files)
+    if num_valids:
+        images["valid"] = images["train"][-num_valids:]
+        labels["valid"] = labels["train"][-num_valids:]
+        images["train"] = images["train"][:-num_valids]
+        labels["train"] = labels["train"][:-num_valids]
+    else:
+        images["valid"], labels["valid"] = None, None
+    images["test"], labels["test"] = _read_data(data_path, test_file)
+    print("Prepropcess: [subtract mean], [divide std]")
+    mean = np.mean(images["train"], axis=(0, 1, 2), keepdims=True)
+    std = np.std(images["train"], axis=(0, 1, 2), keepdims=True)
+    print("mean: {}".format(np.reshape(mean * 255.0, [-1])))
+    print("std: {}".format(np.reshape(std * 255.0, [-1])))
+    images["train"] = (images["train"] - mean) / std
+    if num_valids:
+        images["valid"] = (images["valid"] - mean) / std
+    images["test"] = (images["test"] - mean) / std
+    return images, labels
--- a/examples/trials/nas_cifar10/src/cifar10/general_child.py
+++ b/examples/trials/nas_cifar10/src/cifar10/general_child.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+import tensorflow as tf
+from src.common_ops import create_weight, batch_norm, batch_norm_with_mask, global_avg_pool, conv_op, pool_op
+from src.utils import count_model_params, get_train_ops, get_C, get_strides
+from src.cifar10.models import Model
+class GeneralChild(Model):
+    def __init__(self,
+                 images,
+                 labels,
+                 cutout_size=None,
+                 fixed_arc=None,
+                 out_filters_scale=1,
+                 num_layers=2,
+                 num_branches=6,
+                 out_filters=24,
+                 keep_prob=1.0,
+                 batch_size=32,
+                 clip_mode=None,
+                 grad_bound=None,
+                 l2_reg=1e-4,
+                 lr_init=0.1,
+                 lr_dec_start=0,
+                 lr_dec_every=10000,
+                 lr_dec_rate=0.1,
+                 lr_cosine=False,
+                 lr_max=None,
+                 lr_min=None,
+                 lr_T_0=None,
+                 lr_T_mul=None,
+                 optim_algo=None,
+                 sync_replicas=False,
+                 num_aggregate=None,
+                 num_replicas=None,
+                 data_format="NHWC",
+                 name="child",
+                 mode="subgraph",
+                 *args,
+                 **kwargs
+                 ):
+        super(self.__class__, self).__init__(
+            images,
+            labels,
+            cutout_size=cutout_size,
+            batch_size=batch_size,
+            clip_mode=clip_mode,
+            grad_bound=grad_bound,
+            l2_reg=l2_reg,
+            lr_init=lr_init,
+            lr_dec_start=lr_dec_start,
+            lr_dec_every=lr_dec_every,
+            lr_dec_rate=lr_dec_rate,
+            keep_prob=keep_prob,
+            optim_algo=optim_algo,
+            sync_replicas=sync_replicas,
+            num_aggregate=num_aggregate,
+            num_replicas=num_replicas,
+            data_format=data_format,
+            name=name)
+        self.lr_cosine = lr_cosine
+        self.lr_max = lr_max
+        self.lr_min = lr_min
+        self.lr_T_0 = lr_T_0
+        self.lr_T_mul = lr_T_mul
+        self.out_filters = out_filters * out_filters_scale
+        self.num_layers = num_layers
+        self.mode = mode
+        self.num_branches = num_branches
+        self.fixed_arc = fixed_arc
+        self.out_filters_scale = out_filters_scale
+        pool_distance = self.num_layers // 3
+        self.pool_layers = [pool_distance - 1, 2 * pool_distance - 1]
+    def _factorized_reduction(self, x, out_filters, stride, is_training):
+        """Reduce the spatial size of ``x`` and project it to ``out_filters`` channels.
+
+        With ``stride == 1`` this is a 1x1 convolution followed by batch norm.
+        Otherwise two strided paths are built: one on ``x`` and one on ``x``
+        shifted by one pixel along both spatial axes. Each path is projected to
+        ``out_filters // 2`` channels, then the two are concatenated on the
+        channel axis and batch-normalised. ``out_filters`` must be even.
+        """
+        assert out_filters % 2 == 0, (
+            "Need even number of filters when using this factorized reduction.")
+        if stride == 1:
+            # No spatial reduction needed: only adjust the channel count.
+            with tf.variable_scope("path_conv"):
+                inp_c = get_C(x, self.data_format)
+                w = create_weight("w", [1, 1, inp_c, out_filters])
+                x = tf.nn.conv2d(x, w, [1, 1, 1, 1], "SAME",
+                                 data_format=self.data_format)
+                x = batch_norm(x, is_training, data_format=self.data_format)
+                return x
+        # presumably a 4-element stride list laid out for data_format -- confirm in src.utils
+        stride_spec = get_strides(stride, self.data_format)
+        # Skip path 1
+        # A 1x1 pooling window with the given stride simply subsamples x.
+        path1 = tf.nn.avg_pool(
+            x, [1, 1, 1, 1], stride_spec, "VALID", data_format=self.data_format)
+        with tf.variable_scope("path1_conv"):
+            inp_c = get_C(path1, self.data_format)
+            w = create_weight("w", [1, 1, inp_c, out_filters // 2])
+            path1 = tf.nn.conv2d(path1, w, [1, 1, 1, 1], "SAME",
+                                 data_format=self.data_format)
+        # Skip path 2
+        # First pad with 0's on the right and bottom, then shift the filter to
+        # include those 0's that were added.
+        if self.data_format == "NHWC":
+            pad_arr = [[0, 0], [0, 1], [0, 1], [0, 0]]
+            path2 = tf.pad(x, pad_arr)[:, 1:, 1:, :]
+            concat_axis = 3
+        else:
+            # Any other data_format is handled as channels-first (spatial axes 2 and 3).
+            pad_arr = [[0, 0], [0, 0], [0, 1], [0, 1]]
+            path2 = tf.pad(x, pad_arr)[:, :, 1:, 1:]
+            concat_axis = 1
+        path2 = tf.nn.avg_pool(
+            path2, [1, 1, 1, 1], stride_spec, "VALID", data_format=self.data_format)
+        with tf.variable_scope("path2_conv"):
+            inp_c = get_C(path2, self.data_format)
+            w = create_weight("w", [1, 1, inp_c, out_filters // 2])
+            path2 = tf.nn.conv2d(path2, w, [1, 1, 1, 1], "SAME",
+                                 data_format=self.data_format)
+        # Concat and apply BN
+        final_path = tf.concat(values=[path1, path2], axis=concat_axis)
+        final_path = batch_norm(final_path, is_training,
+                                data_format=self.data_format)
+        return final_path
+    def _model(self, images, is_training, reuse=False):
+        '''Build the network graph for one input batch and return its logits.
+        Args:
+            images: input tensor for the stem convolution; the stem kernel
+                is [3, 3, 3, out_filters], so 3 input channels are expected.
+            is_training: forwarded to batch_norm, conv_op, pool_op and
+                _factorized_reduction; dropout is applied only when truthy.
+            reuse: forwarded to tf.variable_scope(self.name, reuse=reuse).
+        Returns:
+            The output of the final "fc" matmul, whose weight has 10 columns.
+        '''
+        with tf.variable_scope(self.name, reuse=reuse):
+            # `layers` collects every layer output produced so far; the nested
+            # helpers below read and append to it through the closure.
+            layers = []
+            out_filters = self.out_filters
+            # Stem: 3x3 convolution (stride 1, SAME padding) followed by batch norm.
+            with tf.variable_scope("stem_conv"):
+                w = create_weight("w", [3, 3, 3, out_filters])
+                x = tf.nn.conv2d(
+                    images, w, [1, 1, 1, 1], "SAME", data_format=self.data_format)
+                x = batch_norm(x, is_training, data_format=self.data_format)
+                layers.append(x)
+            def add_fixed_pooling_layer(layer_id, layers, out_filters, is_training):
+                '''Downsample every tensor in `layers` and double the filter count.
+                Each tensor goes through self._factorized_reduction with stride 2
+                and the doubled out_filters. Returns (pooled_layers, out_filters).
+                In this method it is called after layers 3 and 7.
+                '''
+                out_filters *= 2
+                with tf.variable_scope("pool_at_{0}".format(layer_id)):
+                    pooled_layers = []
+                    for i, layer in enumerate(layers):
+                        with tf.variable_scope("from_{0}".format(i)):
+                            x = self._factorized_reduction(
+                                layer, out_filters, 2, is_training)
+                        pooled_layers.append(x)
+                    return pooled_layers, out_filters
+            def post_process_out(out, optional_inputs):
+                '''Form skip connection and perform batch norm.
+                `out` is appended to `optional_inputs` (mutating that list), the
+                list is summed with tf.add_n, batch-normalised, appended to the
+                enclosing `layers` list and returned.
+                '''
+                with tf.variable_scope("skip"):
+                    # Pin the static shape of `out` to the spatial size of the
+                    # latest layer and the current out_filters.
+                    inputs = layers[-1]
+                    if self.data_format == "NHWC":
+                        inp_h = inputs.get_shape()[1].value
+                        inp_w = inputs.get_shape()[2].value
+                        inp_c = inputs.get_shape()[3].value
+                        out.set_shape([None, inp_h, inp_w, out_filters])
+                    elif self.data_format == "NCHW":
+                        inp_c = inputs.get_shape()[1].value
+                        inp_h = inputs.get_shape()[2].value
+                        inp_w = inputs.get_shape()[3].value
+                        out.set_shape([None, out_filters, inp_h, inp_w])
+                    # NOTE(review): inp_c is assigned in both branches but not read here.
+                    optional_inputs.append(out)
+                    pout = tf.add_n(optional_inputs)
+                    out = batch_norm(pout, is_training,
+                                     data_format=self.data_format)
+                layers.append(out)
+                return out
+            # Scope-name counter kept in a module-level global; it is reset to -1
+            # on every _model() call, so each call names its scopes layer_0, layer_1, ...
+            global layer_id
+            layer_id = -1
+            def get_layer_id():
+                global layer_id
+                layer_id += 1
+                return 'layer_' + str(layer_id)
+            # The six candidate ops below share one shape: open a fresh
+            # "layer_<n>" scope, apply the op to inputs[0][0] inside its own
+            # "branch_<k>" scope, then merge with inputs[1] via post_process_out.
+            # Presumably inputs is (fixed_inputs, optional_inputs) as supplied by
+            # the NNI-generated code for the annotations further down -- confirm.
+            def conv3(inputs):
+                # res_layers is pre_layers that are chosen to form skip connection
+                # layers[-1] is always the latest input
+                with tf.variable_scope(get_layer_id()):
+                    with tf.variable_scope('branch_0'):
+                        out = conv_op(
+                            inputs[0][0], 3, is_training, out_filters, out_filters, self.data_format, start_idx=None)
+                    out = post_process_out(out, inputs[1])
+                return out
+            def conv3_sep(inputs):
+                with tf.variable_scope(get_layer_id()):
+                    with tf.variable_scope('branch_1'):
+                        out = conv_op(
+                            inputs[0][0], 3, is_training, out_filters, out_filters, self.data_format, start_idx=None, separable=True)
+                    out = post_process_out(out, inputs[1])
+                return out
+            def conv5(inputs):
+                with tf.variable_scope(get_layer_id()):
+                    with tf.variable_scope('branch_2'):
+                        out = conv_op(
+                            inputs[0][0], 5, is_training, out_filters, out_filters, self.data_format, start_idx=None)
+                    out = post_process_out(out, inputs[1])
+                return out
+            def conv5_sep(inputs):
+                with tf.variable_scope(get_layer_id()):
+                    with tf.variable_scope('branch_3'):
+                        out = conv_op(
+                            inputs[0][0], 5, is_training, out_filters, out_filters, self.data_format, start_idx=None, separable=True)
+                    out = post_process_out(out, inputs[1])
+                return out
+            def avg_pool(inputs):
+                with tf.variable_scope(get_layer_id()):
+                    with tf.variable_scope('branch_4'):
+                        out = pool_op(
+                            inputs[0][0], is_training, out_filters, out_filters, "avg", self.data_format, start_idx=None)
+                    out = post_process_out(out, inputs[1])
+                return out
+            def max_pool(inputs):
+                with tf.variable_scope(get_layer_id()):
+                    with tf.variable_scope('branch_5'):
+                        out = pool_op(
+                            inputs[0][0], is_training, out_filters, out_filters, "max", self.data_format, start_idx=None)
+                    out = post_process_out(out, inputs[1])
+                return out
+            # NNI annotation for layers 0-3. This is a bare string statement, not
+            # a docstring; it names local functions and variables (x, conv3, ...,
+            # layer_N_out), so keep both the string and those names unchanged.
+            # Presumably NNI's annotation tooling rewrites it into code that binds
+            # each layer_output name -- confirm against the NNI documentation.
+            """@nni.mutable_layers(
+            {
+                layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()],
+                fixed_inputs:[x],
+                layer_output: layer_0_out
+            },
+            {
+                layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()],
+                fixed_inputs:[layer_0_out],
+                optional_inputs: [layer_0_out],
+                optional_input_size: [0, 1],
+                layer_output: layer_1_out
+            },
+            {
+                layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()],
+                fixed_inputs:[layer_1_out],
+                optional_inputs: [layer_0_out, layer_1_out],
+                optional_input_size: [0, 1],
+                layer_output: layer_2_out
+            },
+            {
+                layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()],
+                fixed_inputs:[layer_2_out],
+                optional_inputs: [layer_0_out, layer_1_out, layer_2_out],
+                optional_input_size: [0, 1],
+                layer_output: layer_3_out
+            }
+            )"""
+            # First fixed reduction: every collected layer is downsampled and
+            # out_filters doubles; the layer_N_out names are re-bound to the last
+            # four pooled tensors so later annotations refer to the pooled versions.
+            layers, out_filters = add_fixed_pooling_layer(
+                3, layers, out_filters, is_training)
+            layer_0_out, layer_1_out, layer_2_out, layer_3_out = layers[-4:]
+            # NNI annotation for layers 4-7 (same structure as above).
+            """@nni.mutable_layers(
+            {
+                layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()],
+                fixed_inputs: [layer_3_out],
+                optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out],
+                optional_input_size: [0, 1],
+                layer_output: layer_4_out
+            },
+            {
+                layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()],
+                fixed_inputs: [layer_4_out],
+                optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out],
+                optional_input_size: [0, 1],
+                layer_output: layer_5_out
+            },
+            {
+                layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()],
+                fixed_inputs: [layer_5_out],
+                optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out],
+                optional_input_size: [0, 1],
+                layer_output: layer_6_out
+            },
+            {
+                layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()],
+                fixed_inputs: [layer_6_out],
+                optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out],
+                optional_input_size: [0, 1],
+                layer_output: layer_7_out
+            }
+            )"""
+            # Second fixed reduction; re-bind the last eight pooled tensors.
+            layers, out_filters = add_fixed_pooling_layer(
+                7, layers, out_filters, is_training)
+            layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out = layers[
+                -8:]
+            # NNI annotation for layers 8-11 (same structure as above).
+            """@nni.mutable_layers(
+            {
+                layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()],
+                fixed_inputs: [layer_7_out],
+                optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out],
+                optional_input_size: [0, 1],
+                layer_output: layer_8_out
+            },
+            {
+                layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()],
+                fixed_inputs: [layer_8_out],
+                optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out, layer_8_out],
+                optional_input_size: [0, 1],
+                layer_output: layer_9_out
+            },
+            {
+                layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()],
+                fixed_inputs: [layer_9_out],
+                optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out, layer_8_out, layer_9_out],
+                optional_input_size: [0, 1],
+                layer_output: layer_10_out
+            },
+            {
+                layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()],
+                fixed_inputs:[layer_10_out],
+                optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out, layer_8_out, layer_9_out, layer_10_out],
+                optional_input_size: [0, 1],
+                layer_output: layer_11_out
+            }
+            )"""
+            # layer_11_out is never assigned by plain Python in this method; the
+            # only place it is named is as layer_output in the annotation string
+            # directly above.
+            x = global_avg_pool(layer_11_out, data_format=self.data_format)
+            if is_training:
+                x = tf.nn.dropout(x, self.keep_prob)
+            with tf.variable_scope("fc"):
+                # NOTE(review): the NHWC branch reads dimension index 3 while x is
+                # then multiplied by a rank-2 weight -- confirm what rank
+                # global_avg_pool returns for NHWC input.
+                if self.data_format == "NHWC":
+                    inp_c = x.get_shape()[3].value
+                elif self.data_format == "NCHW":
+                    inp_c = x.get_shape()[1].value
+                else:
+                    raise ValueError(
+                        "Unknown data_format {0}".format(self.data_format))
+                w = create_weight("w", [inp_c, 10])
+                x = tf.matmul(x, w)
+        return x
+    # override
+    def _build_train(self):
+        """Create the training graph: loss, correct-prediction count and train op.
+        Sets self.loss, self.train_preds, self.train_acc, self.num_vars,
+        self.global_step, self.train_op, self.lr, self.grad_norm and
+        self.optimizer.
+        """
+        separator = "-" * 80
+        print(separator)
+        print("Build train graph")
+        train_logits = self._model(self.x_train, is_training=True)
+        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+            logits=train_logits, labels=self.y_train)
+        self.loss = tf.reduce_mean(per_example_loss)
+        # train_acc is the number of correct predictions in the batch, not a ratio.
+        predictions = tf.to_int32(tf.argmax(train_logits, axis=1))
+        self.train_preds = predictions
+        correct = tf.to_int32(tf.equal(predictions, self.y_train))
+        self.train_acc = tf.reduce_sum(correct)
+        # Only variables whose name starts with this model's name are optimised.
+        own_variables = []
+        for variable in tf.trainable_variables():
+            if variable.name.startswith(self.name):
+                own_variables.append(variable)
+        self.num_vars = count_model_params(own_variables)
+        print("Model has {} params".format(self.num_vars))
+        self.global_step = tf.Variable(
+            0, dtype=tf.int32, trainable=False, name="global_step")
+        optimizer_settings = dict(
+            clip_mode=self.clip_mode,
+            grad_bound=self.grad_bound,
+            l2_reg=self.l2_reg,
+            lr_init=self.lr_init,
+            lr_dec_start=self.lr_dec_start,
+            lr_dec_every=self.lr_dec_every,
+            lr_dec_rate=self.lr_dec_rate,
+            lr_cosine=self.lr_cosine,
+            lr_max=self.lr_max,
+            lr_min=self.lr_min,
+            lr_T_0=self.lr_T_0,
+            lr_T_mul=self.lr_T_mul,
+            num_train_batches=self.num_train_batches,
+            optim_algo=self.optim_algo,
+            sync_replicas=False,
+            num_aggregate=self.num_aggregate,
+            num_replicas=self.num_replicas,
+        )
+        train_ops = get_train_ops(
+            self.loss, own_variables, self.global_step, **optimizer_settings)
+        self.train_op, self.lr, self.grad_norm, self.optimizer = train_ops
+    # override
+    def _build_valid(self):
+        """Create the validation prediction and correct-count ops.
+        Does nothing when self.x_valid is None. Otherwise calls _model with
+        reuse=True and sets self.valid_preds and self.valid_acc.
+        """
+        if self.x_valid is None:
+            return
+        separator = "-" * 80
+        print(separator)
+        print("Build valid graph")
+        valid_logits = self._model(self.x_valid, False, reuse=True)
+        predictions = tf.to_int32(tf.argmax(valid_logits, axis=1))
+        self.valid_preds = predictions
+        # valid_acc is the number of correct predictions in the batch.
+        correct = tf.to_int32(tf.equal(predictions, self.y_valid))
+        self.valid_acc = tf.reduce_sum(correct)
+    # override
+    def _build_test(self):
+        """Create the test prediction and correct-count ops.
+        Calls _model with reuse=True and sets self.test_preds and
+        self.test_acc.
+        """
+        separator = "-" * 80
+        print(separator)
+        print("Build test graph")
+        test_logits = self._model(self.x_test, False, reuse=True)
+        predictions = tf.to_int32(tf.argmax(test_logits, axis=1))
+        self.test_preds = predictions
+        # test_acc is the number of correct predictions in the batch.
+        correct = tf.to_int32(tf.equal(predictions, self.y_test))
+        self.test_acc = tf.reduce_sum(correct)
+    def build_model(self):
+        """Build the train, valid and test graphs, in that order."""
+        stages = (self._build_train, self._build_valid, self._build_test)
+        for build_stage in stages:
+            build_stage()
--- a/examples/trials/nas_cifar10/src/cifar10/models.py
+++ b/examples/trials/nas_cifar10/src/cifar10/models.py
+import os
+import sys
+import numpy as np
+import tensorflow as tf
+class Model(object):
+    """Base class that builds the data input ops and the evaluation loop.
+    Subclasses provide the network through _model() and the graphs through
+    _build_train(), _build_valid() and _build_test(), which raise
+    NotImplementedError here.
+    """
+    def __init__(self,
+                 images,
+                 labels,
+                 cutout_size=None,
+                 batch_size=32,
+                 eval_batch_size=100,
+                 clip_mode=None,
+                 grad_bound=None,
+                 l2_reg=1e-4,
+                 lr_init=0.1,
+                 lr_dec_start=0,
+                 lr_dec_every=100,
+                 lr_dec_rate=0.1,
+                 keep_prob=1.0,
+                 optim_algo=None,
+                 sync_replicas=False,
+                 num_aggregate=None,
+                 num_replicas=None,
+                 data_format="NHWC",
+                 name="generic_model",
+                 seed=None,
+                 ):
+        """Store the hyper-parameters and build the train/valid/test input ops.
+        Args:
+                images: mapping indexed with "train", "valid" and "test".
+                    It is modified in place: "valid_original" is added when
+                    "valid" is not None, and "valid"/"test" are replaced by
+                    transposed tensors when data_format is "NCHW".
+                labels: mapping indexed with the same keys; "valid_original"
+                    is added alongside images.
+                cutout_size: side of the square that is zeroed in each
+                    training image; None disables cutout.
+                batch_size: training batch size.
+                eval_batch_size: batch size of the valid and test batches.
+                lr_dec_every: number of epochs to decay; stored on self
+                    multiplied by the number of training batches per epoch.
+                data_format: "NCHW" triggers the transposes; any other
+                    value leaves the data layout untouched here.
+                seed: passed to the shuffle queue, random crop and flip.
+                The remaining arguments are stored on self unchanged.
+        """
+        print("-" * 80)
+        print("Build model {}".format(name))
+        self.cutout_size = cutout_size
+        self.batch_size = batch_size
+        self.eval_batch_size = eval_batch_size
+        self.clip_mode = clip_mode
+        self.grad_bound = grad_bound
+        self.l2_reg = l2_reg
+        self.lr_init = lr_init
+        self.lr_dec_start = lr_dec_start
+        self.lr_dec_rate = lr_dec_rate
+        self.keep_prob = keep_prob
+        self.optim_algo = optim_algo
+        self.sync_replicas = sync_replicas
+        self.num_aggregate = num_aggregate
+        self.num_replicas = num_replicas
+        self.data_format = data_format
+        self.name = name
+        self.seed = seed
+        # Filled in later by the _build_* methods; eval_once() checks them.
+        self.global_step = None
+        self.valid_acc = None
+        self.test_acc = None
+        print("Build data ops")
+        with tf.device("/cpu:0"):
+            # training data
+            self.num_train_examples = np.shape(images["train"])[0]
+            # Ceiling division: batches needed to cover every training example.
+            self.num_train_batches = (
+                self.num_train_examples + self.batch_size - 1) // self.batch_size
+            x_train, y_train = tf.train.shuffle_batch(
+                [images["train"], labels["train"]],
+                batch_size=self.batch_size,
+                capacity=50000,
+                enqueue_many=True,
+                min_after_dequeue=0,
+                num_threads=16,
+                seed=self.seed,
+                allow_smaller_final_batch=True,
+            )
+            # Convert the decay period from epochs to training steps.
+            self.lr_dec_every = lr_dec_every * self.num_train_batches
+            def _pre_process(x):
+                # Augment one training image: pad 4 pixels on each spatial side,
+                # take a random 32x32x3 crop, then flip left-right at random.
+                x = tf.pad(x, [[4, 4], [4, 4], [0, 0]])
+                x = tf.random_crop(x, [32, 32, 3], seed=self.seed)
+                x = tf.image.random_flip_left_right(x, seed=self.seed)
+                if self.cutout_size is not None:
+                    # Cutout: build a 32x32 mask holding a cutout_size square of
+                    # ones whose top-left corner is at `start` (clipped at the
+                    # bottom/right border), by zero-padding the square onto a
+                    # larger canvas and slicing the 32x32 window back out.
+                    mask = tf.ones(
+                        [self.cutout_size, self.cutout_size], dtype=tf.int32)
+                    start = tf.random_uniform(
+                        [2], minval=0, maxval=32, dtype=tf.int32)
+                    mask = tf.pad(mask, [[self.cutout_size + start[0], 32 - start[0]],
+                                         [self.cutout_size + start[1], 32 - start[1]]])
+                    mask = mask[self.cutout_size: self.cutout_size + 32,
+                                self.cutout_size: self.cutout_size + 32]
+                    mask = tf.reshape(mask, [32, 32, 1])
+                    mask = tf.tile(mask, [1, 1, 3])
+                    # Keep pixels where the mask is 0; zero them where it is 1.
+                    x = tf.where(tf.equal(mask, 0), x=x, y=tf.zeros_like(x))
+                if self.data_format == "NCHW":
+                    # Move the channel axis first for a single (unbatched) image.
+                    x = tf.transpose(x, [2, 0, 1])
+                return x
+            self.x_train = tf.map_fn(_pre_process, x_train, back_prop=False)
+            self.y_train = y_train
+            # valid data
+            self.x_valid, self.y_valid = None, None
+            if images["valid"] is not None:
+                # Keep untouched copies before "valid" may be replaced below.
+                images["valid_original"] = np.copy(images["valid"])
+                labels["valid_original"] = np.copy(labels["valid"])
+                if self.data_format == "NCHW":
+                    images["valid"] = tf.transpose(
+                        images["valid"], [0, 3, 1, 2])
+                # NOTE(review): in the NCHW case images["valid"] is the result of
+                # tf.transpose at this point, so np.shape() is applied to that
+                # object rather than to the original array -- confirm the
+                # resulting count is a plain int.
+                self.num_valid_examples = np.shape(images["valid"])[0]
+                self.num_valid_batches = (
+                    (self.num_valid_examples + self.eval_batch_size - 1)
+                    // self.eval_batch_size)
+                self.x_valid, self.y_valid = tf.train.batch(
+                    [images["valid"], labels["valid"]],
+                    batch_size=self.eval_batch_size,
+                    capacity=5000,
+                    enqueue_many=True,
+                    num_threads=1,
+                    allow_smaller_final_batch=True,
+                )
+            # test data
+            # Unlike "valid", images["test"] is used unconditionally.
+            if self.data_format == "NCHW":
+                images["test"] = tf.transpose(images["test"], [0, 3, 1, 2])
+            # NOTE(review): same np.shape()-on-transposed-tensor question as above.
+            self.num_test_examples = np.shape(images["test"])[0]
+            self.num_test_batches = (
+                (self.num_test_examples + self.eval_batch_size - 1)
+                // self.eval_batch_size)
+            self.x_test, self.y_test = tf.train.batch(
+                [images["test"], labels["test"]],
+                batch_size=self.eval_batch_size,
+                capacity=10000,
+                enqueue_many=True,
+                num_threads=1,
+                allow_smaller_final_batch=True,
+            )
+        # cache images and labels
+        self.images = images
+        self.labels = labels
+    def eval_once(self, sess, eval_set, child_model, verbose=False):
+        """Run one pass over an evaluation set and return its accuracy.
+        Expects self.global_step and the matching accuracy op
+        (self.valid_acc or self.test_acc) to be defined.
+        Args:
+                sess: tf.Session() or one of its wrap arounds.
+                eval_set: "valid" or "test"
+                child_model: not used in this method.
+                verbose: when true, write the running totals to stdout.
+        Returns:
+                The sum of the accuracy op over all batches divided by
+                (number of batches * self.eval_batch_size).
+        Raises:
+                NotImplementedError: if eval_set is neither "valid" nor "test".
+        """
+        assert self.global_step is not None
+        global_step = sess.run(self.global_step)
+        print("Eval at {}".format(global_step))
+        # NOTE(review): num_examples is assigned in both branches but never read;
+        # the denominator below is accumulated from eval_batch_size instead.
+        if eval_set == "valid":
+            assert self.x_valid is not None
+            assert self.valid_acc is not None
+            num_examples = self.num_valid_examples
+            num_batches = self.num_valid_batches
+            acc_op = self.valid_acc
+        elif eval_set == "test":
+            assert self.test_acc is not None
+            num_examples = self.num_test_examples
+            num_batches = self.num_test_batches
+            acc_op = self.test_acc
+        else:
+            raise NotImplementedError("Unknown eval_set '{}'".format(eval_set))
+        total_acc = 0
+        total_exp = 0
+        for batch_id in range(num_batches):
+            # Each run is added as a count of correct predictions; the batch is
+            # assumed to contain eval_batch_size examples.
+            acc = sess.run(acc_op)
+            total_acc += acc
+            total_exp += self.eval_batch_size
+            if verbose:
+                # "\r" rewrites the same console line with the running totals.
+                sys.stdout.write(
+                    "\r{:<5d}/{:>5d}".format(total_acc, total_exp))
+        if verbose:
+            print("")
+        print("{}_accuracy: {:<6.4f}".format(
+            eval_set, float(total_acc) / total_exp))
+        return float(total_acc) / total_exp
+    # The four methods below are placeholders that subclasses must override.
+    def _model(self, images, is_training, reuse=None):
+        raise NotImplementedError("Abstract method")
+    def _build_train(self):
+        raise NotImplementedError("Abstract method")
+    def _build_valid(self):
+        raise NotImplementedError("Abstract method")
+    def _build_test(self):
+        raise NotImplementedError("Abstract method")
--- a/examples/trials/nas_cifar10/src/cifar10/nni_child_cifar10.py
+++ b/examples/trials/nas_cifar10/src/cifar10/nni_child_cifar10.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import shutil
+import logging
+import tensorflow as tf
+from src.cifar10.data_utils import read_data
+from src.cifar10.general_child import GeneralChild
+import src.cifar10_flags
+from src.cifar10_flags import FLAGS
+def build_logger(log_name):
+    """Return the logger named log_name, set to DEBUG, with a file handler.
+    The handler writes to the file log_name + '.log' and is also set to DEBUG.
+    """
+    named_logger = logging.getLogger(log_name)
+    named_logger.setLevel(logging.DEBUG)
+    file_handler = logging.FileHandler(log_name+'.log')
+    file_handler.setLevel(logging.DEBUG)
+    named_logger.addHandler(file_handler)
+    return named_logger
+# Module-level logger shared by the functions and classes below; created at
+# import time, which also opens the "nni_child_cifar10.log" file handler.
+logger = build_logger("nni_child_cifar10")
+def build_trial(images, labels, ChildClass):
+    '''Instantiate ChildClass on the given data with settings read from FLAGS.'''
+    settings = dict(
+        use_aux_heads=FLAGS.child_use_aux_heads,
+        cutout_size=FLAGS.child_cutout_size,
+        num_layers=FLAGS.child_num_layers,
+        num_cells=FLAGS.child_num_cells,
+        num_branches=FLAGS.child_num_branches,
+        fixed_arc=FLAGS.child_fixed_arc,
+        out_filters_scale=FLAGS.child_out_filters_scale,
+        out_filters=FLAGS.child_out_filters,
+        keep_prob=FLAGS.child_keep_prob,
+        drop_path_keep_prob=FLAGS.child_drop_path_keep_prob,
+        num_epochs=FLAGS.num_epochs,
+        l2_reg=FLAGS.child_l2_reg,
+        data_format=FLAGS.data_format,
+        batch_size=FLAGS.batch_size,
+        # Gradient clipping mode and optimizer are fixed here, not read from FLAGS.
+        clip_mode="norm",
+        grad_bound=FLAGS.child_grad_bound,
+        lr_init=FLAGS.child_lr,
+        lr_dec_every=FLAGS.child_lr_dec_every,
+        lr_dec_rate=FLAGS.child_lr_dec_rate,
+        lr_cosine=FLAGS.child_lr_cosine,
+        lr_max=FLAGS.child_lr_max,
+        lr_min=FLAGS.child_lr_min,
+        lr_T_0=FLAGS.child_lr_T_0,
+        lr_T_mul=FLAGS.child_lr_T_mul,
+        optim_algo="momentum",
+        sync_replicas=FLAGS.child_sync_replicas,
+        num_aggregate=FLAGS.child_num_aggregate,
+        num_replicas=FLAGS.child_num_replicas,
+    )
+    return ChildClass(images, labels, **settings)
+def get_child_ops(child_model):
+    '''Collect the child model's ops and helpers into a dict keyed by name.'''
+    # Entries copied straight from attributes of the same name.
+    attribute_names = (
+        "global_step",
+        "loss",
+        "train_op",
+        "lr",
+        "grad_norm",
+        "train_acc",
+        "optimizer",
+        "num_train_batches",
+    )
+    child_ops = {}
+    for attribute in attribute_names:
+        child_ops[attribute] = getattr(child_model, attribute)
+    # Evaluation interval in training steps: batches per epoch times epochs.
+    child_ops["eval_every"] = (
+        child_model.num_train_batches * FLAGS.eval_every_epochs)
+    child_ops["eval_func"] = child_model.eval_once
+    return child_ops
+class NASTrial():
+    '''One trial: builds a GeneralChild model, trains it and reports accuracy.'''
+    def __init__(self):
+        '''Read the data, build the child graph and open a session.'''
+        images, labels = read_data(FLAGS.data_path, num_valids=0)
+        nni_output_dir = os.getenv('NNI_OUTPUT_DIR')
+        self.output_dir = os.path.join(nni_output_dir, '../..')
+        self.file_path = os.path.join(
+            self.output_dir, 'trainable_variable.txt')
+        self.graph = tf.Graph()
+        with self.graph.as_default():
+            self.child_model = build_trial(images, labels, GeneralChild)
+            self.total_data = {}
+            self.child_model.build_model()
+            self.child_ops = get_child_ops(self.child_model)
+            session_config = tf.ConfigProto(
+                allow_soft_placement=True,
+                intra_op_parallelism_threads=0,
+                inter_op_parallelism_threads=0)
+            self.sess = tf.train.SingularMonitoredSession(
+                config=session_config)
+        logger.debug('initlize NASTrial done.')
+    def run_one_step(self):
+        '''Run one training step and return (loss, global_step).'''
+        ops = self.child_ops
+        fetches = [ops[key] for key in (
+            "loss", "lr", "grad_norm", "train_acc", "train_op")]
+        loss, lr, gn, tr_acc, _ = self.sess.run(fetches)
+        global_step = self.sess.run(ops["global_step"])
+        # The message is always formatted, but only emitted every
+        # FLAGS.log_every steps.
+        fragments = [
+            "ch_step={:<6d}".format(global_step),
+            " loss={:<8.6f}".format(loss),
+            " lr={:<8.4f}".format(lr),
+            " |g|={:<8.4f}".format(gn),
+            " tr_acc={:<3d}/{:>3d}".format(tr_acc, FLAGS.batch_size),
+        ]
+        log_string = "".join(fragments)
+        if int(global_step) % FLAGS.log_every == 0:
+            logger.debug(log_string)
+        return loss, global_step
+    def run(self):
+        '''Train for FLAGS.num_epochs epochs, evaluating on "test" each epoch.
+        The accuracy of every evaluation and the best accuracy overall are
+        reported through the NNI annotation strings below, which must stay
+        as bare string statements referencing `acc` and `max_acc`.
+        '''
+        max_acc = 0
+        batches_per_epoch = self.child_ops['num_train_batches']
+        evaluate = self.child_ops["eval_func"]
+        while True:
+            _, global_step = self.run_one_step()
+            # An epoch boundary is a step count divisible by batches per epoch.
+            if global_step % batches_per_epoch == 0:
+                acc = evaluate(self.sess, "test", self.child_model)
+                max_acc = max(max_acc, acc)
+                '''@nni.report_intermediate_result(acc)'''
+            if global_step / batches_per_epoch >= FLAGS.num_epochs:
+                '''@nni.report_final_result(max_acc)'''
+                break
+def main(_):
+    '''Prepare FLAGS.output_dir, then build and run one NASTrial.
+    The directory is created when missing; when it exists it is removed and
+    recreated only if FLAGS.reset_output_dir is set.
+    '''
+    separator = "-" * 80
+    logger.debug(separator)
+    if os.path.isdir(FLAGS.output_dir):
+        if FLAGS.reset_output_dir:
+            logger.debug(
+                "Path {} exists. Remove and remake.".format(FLAGS.output_dir))
+            shutil.rmtree(FLAGS.output_dir)
+            os.makedirs(FLAGS.output_dir)
+    else:
+        logger.debug(
+            "Path {} does not exist. Creating.".format(FLAGS.output_dir))
+        os.makedirs(FLAGS.output_dir)
+    logger.debug(separator)
+    NASTrial().run()
+# Script entry point: control is handed to tf.app.run(), which is expected to
+# call main() defined above (TF 1.x app runner; see its documentation for how
+# command-line flags are handled).
+if __name__ == "__main__":
+    tf.app.run()