"web/vscode:/vscode.git/clone" did not exist on "951a2064a34e3e2ab468942663fa59df6a212af3"
Unverified commit cd3a912a authored by SparkSnail, committed by GitHub

Merge pull request #218 from microsoft/master

merge master
parents a0846f2a e9cba778
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import bz2
import urllib.request
import numpy as np
import os
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from nni.feature_engineering.gradient_selector import FeatureGradientSelector
class Benchmark:
    def __init__(self, files, test_size=0.2):
self.files = files
self.test_size = test_size
def run_all_test(self, pipeline):
for file_name in self.files:
file_path = self.files[file_name]
self.run_test(pipeline, file_name, file_path)
def run_test(self, pipeline, name, path):
print("download " + name)
update_name = self.download(name, path)
X, y = load_svmlight_file(update_name)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=42)
pipeline.fit(X_train, y_train)
print("[Benchmark "+ name + " Score]: ", pipeline.score(X_test, y_test))
def download(self, name, path):
old_name = name + '_train.bz2'
update_name = name + '_train.svm'
if os.path.exists(old_name) and os.path.exists(update_name):
return update_name
        urllib.request.urlretrieve(path, filename=old_name)
        # decompress the bz2 archive into a plain svmlight-format file
        with bz2.open(old_name, 'rb') as f_zip:
            data = f_zip.read()
        with open(update_name, 'wt') as f_svm:
            f_svm.write(data.decode('utf-8'))
return update_name
if __name__ == "__main__":
LIBSVM_DATA = {
"rcv1" : "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2",
# "avazu" : "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2",
"colon-cancer" : "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/covtype.libsvm.binary.bz2",
"gisette" : "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/gisette_scale.bz2",
# "kdd2010" : "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.bz2",
# "kdd2012" : "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdd12.bz2",
"news20.binary" : "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/news20.binary.bz2",
"real-sim" : "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/real-sim.bz2",
"webspam" : "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/webspam_wc_normalized_trigram.svm.bz2"
}
test_benchmark = Benchmark(LIBSVM_DATA)
pipeline1 = make_pipeline(LogisticRegression())
print("Test all data in LogisticRegression.")
print()
test_benchmark.run_all_test(pipeline1)
pipeline2 = make_pipeline(FeatureGradientSelector(n_features=20), LogisticRegression())
print("Test data selected by FeatureGradientSelector in LogisticRegression.")
print()
test_benchmark.run_all_test(pipeline2)
pipeline3 = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())
print("Test data selected by TreeClssifier in LogisticRegression.")
print()
test_benchmark.run_all_test(pipeline3)
print("Done.")
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import bz2
import urllib.request
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from nni.feature_engineering.gradient_selector import FeatureGradientSelector
url_zip_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2'
urllib.request.urlretrieve(url_zip_train, filename='train.bz2')
# decompress the bz2 archive into a plain svmlight-format file
with bz2.open('train.bz2', 'rb') as f_zip:
    data = f_zip.read()
with open('train.svm', 'wt') as f_svm:
    f_svm.write(data.decode('utf-8'))
X, y = load_svmlight_file('train.svm')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
fgs = FeatureGradientSelector(n_features=10)
fgs.fit(X_train, y_train)
print("selected features\t", fgs.get_selected_features())
pipeline = make_pipeline(FeatureGradientSelector(n_epochs=1, n_features=10), LogisticRegression())
# pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())
pipeline.fit(X_train, y_train)
print("Pipeline Score: ", pipeline.score(X_train, y_train))
import torch
import torch.nn.functional as F
from torchvision import datasets, transforms
from nni.compression.torch import QAT_Quantizer
class Mnist(torch.nn.Module):
def __init__(self):
super().__init__()
self.conv1 = torch.nn.Conv2d(1, 20, 5, 1)
self.conv2 = torch.nn.Conv2d(20, 50, 5, 1)
self.fc1 = torch.nn.Linear(4 * 4 * 50, 500)
self.fc2 = torch.nn.Linear(500, 10)
self.relu1 = torch.nn.ReLU6()
self.relu2 = torch.nn.ReLU6()
self.relu3 = torch.nn.ReLU6()
def forward(self, x):
x = self.relu1(self.conv1(x))
x = F.max_pool2d(x, 2, 2)
x = self.relu2(self.conv2(x))
x = F.max_pool2d(x, 2, 2)
x = x.view(-1, 4 * 4 * 50)
x = self.relu3(self.fc1(x))
x = self.fc2(x)
return F.log_softmax(x, dim=1)
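# Shape check for the 4 * 4 * 50 flatten above, given 28x28 MNIST input:
# conv1 (5x5, no padding) -> 24x24, max_pool2d(2) -> 12x12,
# conv2 (5x5, no padding) -> 8x8, max_pool2d(2) -> 4x4 with 50 channels.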
def train(model, quantizer, device, train_loader, optimizer):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
quantizer.step()
if batch_idx % 100 == 0:
print('{:2.0f}% Loss {}'.format(100 * batch_idx / len(train_loader), loss.item()))
def test(model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
test_loss += F.nll_loss(output, target, reduction='sum').item()
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
    print('Loss: {} Accuracy: {}%\n'.format(
        test_loss, 100 * correct / len(test_loader.dataset)))
def main():
torch.manual_seed(0)
device = torch.device('cpu')
trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('data', train=True, download=True, transform=trans),
batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('data', train=False, transform=trans),
batch_size=1000, shuffle=True)
model = Mnist()
    '''You can swap in a different quantizer here, e.g. DoReFaQuantizer;
    a hedged sketch is in the comment after `configure_list` below.
    '''
configure_list = [{
'quant_types': ['weight'],
'quant_bits': {
'weight': 8,
        }, # a plain int also works here when all quant_types share the same bit width, as in the ReLU6 entry below
'op_types':['Conv2d', 'Linear']
}, {
'quant_types': ['output'],
'quant_bits': 8,
'quant_start_step': 7000,
'op_types':['ReLU6']
}]
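    # Hedged sketch of swapping algorithms (it is an assumption that
    # DoReFaQuantizer accepts the same (model, configure_list) arguments
    # as QAT_Quantizer):
    #   from nni.compression.torch import DoReFaQuantizer
    #   quantizer = DoReFaQuantizer(model, configure_list)
    #   quantizer.compress()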
quantizer = QAT_Quantizer(model, configure_list)
quantizer.compress()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
for epoch in range(10):
print('# Epoch {} #'.format(epoch))
train(model, quantizer, device, train_loader, optimizer)
test(model, device, test_loader)
if __name__ == '__main__':
main()
@@ -169,7 +169,7 @@ def main():
     new_model.to(device)
     new_model.load_state_dict(torch.load('pruned_vgg19_cifar10.pth'))
     test(new_model, device, test_loader)
-    # top1 = 93.61%
+    # top1 = 93.74%
 if __name__ == '__main__':
data
checkpoints
runs
authorName: default
experimentName: example_mnist
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 10
#choice: local, remote, pai
trainingServicePlatform: local
#please use `nnictl ss_gen` to generate search space file first
searchSpacePath: <the_generated_search_space_path>
useAnnotation: False
tuner:
codeDir: ../../tuners/random_nas_tuner
classFileName: random_nas_tuner.py
className: RandomNASTuner
trial:
command: python3 mnist.py
codeDir: .
gpuNum: 0
authorName: default
experimentName: example_mnist
trialConcurrency: 1
maxExecDuration: 100h
maxTrialNum: 1000
#choice: local, remote, pai
trainingServicePlatform: local
#please use `nnictl ss_gen` to generate search space file first
searchSpacePath: <the_generated_search_space_path>
useAnnotation: False
tuner:
builtinTunerName: PPOTuner
classArgs:
optimize_mode: maximize
trial:
command: python3 mnist.py
codeDir: .
gpuNum: 0
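# A minimal launch sketch for either config above (the file name config.yml
# is an assumption):
#   nnictl ss_gen          # generates the search space file referenced above
#   nnictl create --config config.yml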
"""
A deep MNIST classifier using convolutional layers.
This file is a modification of the official PyTorch MNIST example:
https://github.com/pytorch/examples/blob/master/mnist/main.py
"""
import os
import argparse
import logging
import nni
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from nni.nas.pytorch.mutables import LayerChoice, InputChoice
from nni.nas.pytorch.classic_nas import get_and_apply_next_architecture
logger = logging.getLogger('mnist_AutoML')
class Net(nn.Module):
def __init__(self, hidden_size):
super(Net, self).__init__()
# two options of conv1
self.conv1 = LayerChoice([nn.Conv2d(1, 20, 5, 1),
nn.Conv2d(1, 20, 3, 1)],
key='first_conv')
# two options of mid_conv
self.mid_conv = LayerChoice([nn.Conv2d(20, 20, 3, 1, padding=1),
nn.Conv2d(20, 20, 5, 1, padding=2)],
key='mid_conv')
self.conv2 = nn.Conv2d(20, 50, 5, 1)
self.fc1 = nn.Linear(4*4*50, hidden_size)
self.fc2 = nn.Linear(hidden_size, 10)
# skip connection over mid_conv
self.input_switch = InputChoice(n_candidates=2,
n_chosen=1,
key='skip')
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.max_pool2d(x, 2, 2)
old_x = x
x = F.relu(self.mid_conv(x))
zero_x = torch.zeros_like(old_x)
skip_x = self.input_switch([zero_x, old_x])
x = torch.add(x, skip_x)
x = F.relu(self.conv2(x))
x = F.max_pool2d(x, 2, 2)
x = x.view(-1, 4*4*50)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return F.log_softmax(x, dim=1)
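# Shape note: both conv1 candidates keep the flatten size valid. With a 28x28
# input, the 5x5 choice gives 24x24 -> 12x12 -> (mid_conv, padded) 12x12 ->
# conv2 8x8 -> 4x4; the 3x3 choice gives 26x26 -> 13x13 -> 13x13 -> 9x9 -> 4x4
# (floor division in pooling), so x.view(-1, 4*4*50) works for either option.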
def train(args, model, device, train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
if batch_idx % args['log_interval'] == 0:
logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
def test(args, model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
# sum up batch loss
test_loss += F.nll_loss(output, target, reduction='sum').item()
# get the index of the max log-probability
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
accuracy = 100. * correct / len(test_loader.dataset)
logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset), accuracy))
return accuracy
def main(args):
use_cuda = not args['no_cuda'] and torch.cuda.is_available()
torch.manual_seed(args['seed'])
device = torch.device("cuda" if use_cuda else "cpu")
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
#data_dir = os.path.join(args['data_dir'], nni.get_trial_id())
data_dir = os.path.join(args['data_dir'], 'data')
train_loader = torch.utils.data.DataLoader(
datasets.MNIST(data_dir, train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args['batch_size'], shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST(data_dir, train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=1000, shuffle=True, **kwargs)
hidden_size = args['hidden_size']
model = Net(hidden_size=hidden_size).to(device)
get_and_apply_next_architecture(model)
optimizer = optim.SGD(model.parameters(), lr=args['lr'],
momentum=args['momentum'])
for epoch in range(1, args['epochs'] + 1):
train(args, model, device, train_loader, optimizer, epoch)
test_acc = test(args, model, device, test_loader)
if epoch < args['epochs']:
# report intermediate result
nni.report_intermediate_result(test_acc)
logger.debug('test accuracy %g', test_acc)
logger.debug('Pipe send intermediate result done.')
else:
# report final result
nni.report_final_result(test_acc)
logger.debug('Final result is %g', test_acc)
logger.debug('Send final result done.')
def get_params():
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument("--data_dir", type=str,
default='/tmp/tensorflow/mnist/input_data', help="data directory")
parser.add_argument('--batch_size', type=int, default=64, metavar='N',
help='input batch size for training (default: 64)')
parser.add_argument("--hidden_size", type=int, default=512, metavar='N',
help='hidden layer size (default: 512)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
help='SGD momentum (default: 0.5)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
help='number of epochs to train (default: 10)')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('--no_cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--log_interval', type=int, default=1000, metavar='N',
help='how many batches to wait before logging training status')
args, _ = parser.parse_known_args()
return args
if __name__ == '__main__':
try:
params = vars(get_params())
main(params)
except Exception as exception:
logger.exception(exception)
raise
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import numpy as np
import torch
from torchvision import transforms
from torchvision.datasets import CIFAR10
class Cutout(object):
def __init__(self, length):
self.length = length
def __call__(self, img):
h, w = img.size(1), img.size(2)
mask = np.ones((h, w), np.float32)
y = np.random.randint(h)
x = np.random.randint(w)
y1 = np.clip(y - self.length // 2, 0, h)
y2 = np.clip(y + self.length // 2, 0, h)
x1 = np.clip(x - self.length // 2, 0, w)
x2 = np.clip(x + self.length // 2, 0, w)
mask[y1: y2, x1: x2] = 0.
mask = torch.from_numpy(mask)
mask = mask.expand_as(img)
img *= mask
return img
def get_dataset(cls, cutout_length=0):
MEAN = [0.49139968, 0.48215827, 0.44653124]
STD = [0.24703233, 0.24348505, 0.26158768]
transf = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
normalize = [
transforms.ToTensor(),
transforms.Normalize(MEAN, STD)
]
cutout = []
if cutout_length > 0:
cutout.append(Cutout(cutout_length))
train_transform = transforms.Compose(transf + normalize + cutout)
valid_transform = transforms.Compose(normalize)
if cls == "cifar10":
dataset_train = CIFAR10(root="./data", train=True, download=True, transform=train_transform)
dataset_valid = CIFAR10(root="./data", train=False, download=True, transform=valid_transform)
else:
raise NotImplementedError
return dataset_train, dataset_valid
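# Usage sketch: CIFAR-10 with a 16-pixel cutout applied to the training split,
# as in the retraining script in this commit:
#   dataset_train, dataset_valid = get_dataset("cifar10", cutout_length=16)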
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch
import torch.nn as nn
import ops
from nni.nas.pytorch import mutables
class AuxiliaryHead(nn.Module):
""" Auxiliary head in 2/3 place of network to let the gradient flow well """
def __init__(self, input_size, C, n_classes):
""" assuming input size 7x7 or 8x8 """
assert input_size in [7, 8]
super().__init__()
self.net = nn.Sequential(
nn.ReLU(inplace=True),
nn.AvgPool2d(5, stride=input_size - 5, padding=0, count_include_pad=False), # 2x2 out
nn.Conv2d(C, 128, kernel_size=1, bias=False),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.Conv2d(128, 768, kernel_size=2, bias=False), # 1x1 out
nn.BatchNorm2d(768),
nn.ReLU(inplace=True)
)
self.linear = nn.Linear(768, n_classes)
def forward(self, x):
out = self.net(x)
out = out.view(out.size(0), -1) # flatten
logits = self.linear(out)
return logits
class Node(nn.Module):
def __init__(self, node_id, num_prev_nodes, channels, num_downsample_connect):
super().__init__()
self.ops = nn.ModuleList()
choice_keys = []
for i in range(num_prev_nodes):
stride = 2 if i < num_downsample_connect else 1
choice_keys.append("{}_p{}".format(node_id, i))
self.ops.append(
mutables.LayerChoice(
[
ops.PoolBN('max', channels, 3, stride, 1, affine=False),
ops.PoolBN('avg', channels, 3, stride, 1, affine=False),
nn.Identity() if stride == 1 else ops.FactorizedReduce(channels, channels, affine=False),
ops.SepConv(channels, channels, 3, stride, 1, affine=False),
ops.SepConv(channels, channels, 5, stride, 2, affine=False),
ops.DilConv(channels, channels, 3, stride, 2, 2, affine=False),
ops.DilConv(channels, channels, 5, stride, 4, 2, affine=False)
],
key=choice_keys[-1]))
self.drop_path = ops.DropPath()
self.input_switch = mutables.InputChoice(choose_from=choice_keys, n_chosen=2, key="{}_switch".format(node_id))
def forward(self, prev_nodes):
assert len(self.ops) == len(prev_nodes)
out = [op(node) for op, node in zip(self.ops, prev_nodes)]
out = [self.drop_path(o) if o is not None else None for o in out]
return self.input_switch(out)
class Cell(nn.Module):
def __init__(self, n_nodes, channels_pp, channels_p, channels, reduction_p, reduction):
super().__init__()
self.reduction = reduction
self.n_nodes = n_nodes
        # If the previous cell is a reduction cell, the current input size does not
        # match the output size of cell[k-2], so output[k-2] must be downsampled in preprocessing.
if reduction_p:
self.preproc0 = ops.FactorizedReduce(channels_pp, channels, affine=False)
else:
self.preproc0 = ops.StdConv(channels_pp, channels, 1, 1, 0, affine=False)
self.preproc1 = ops.StdConv(channels_p, channels, 1, 1, 0, affine=False)
# generate dag
self.mutable_ops = nn.ModuleList()
for depth in range(2, self.n_nodes + 2):
self.mutable_ops.append(Node("{}_n{}".format("reduce" if reduction else "normal", depth),
depth, channels, 2 if reduction else 0))
def forward(self, s0, s1):
        # s0 and s1 are the outputs of cell[k-2] and cell[k-1], respectively.
tensors = [self.preproc0(s0), self.preproc1(s1)]
for node in self.mutable_ops:
cur_tensor = node(tensors)
tensors.append(cur_tensor)
output = torch.cat(tensors[2:], dim=1)
return output
class CNN(nn.Module):
def __init__(self, input_size, in_channels, channels, n_classes, n_layers, n_nodes=4,
stem_multiplier=3, auxiliary=False):
super().__init__()
self.in_channels = in_channels
self.channels = channels
self.n_classes = n_classes
self.n_layers = n_layers
self.aux_pos = 2 * n_layers // 3 if auxiliary else -1
c_cur = stem_multiplier * self.channels
self.stem = nn.Sequential(
nn.Conv2d(in_channels, c_cur, 3, 1, 1, bias=False),
nn.BatchNorm2d(c_cur)
)
# for the first cell, stem is used for both s0 and s1
        # [!] channels_pp and channels_p are output channel sizes, but c_cur is the input channel size.
channels_pp, channels_p, c_cur = c_cur, c_cur, channels
self.cells = nn.ModuleList()
reduction_p, reduction = False, False
for i in range(n_layers):
reduction_p, reduction = reduction, False
            # Reduce feature map size and double channels at the 1/3 and 2/3 depth layers.
if i in [n_layers // 3, 2 * n_layers // 3]:
c_cur *= 2
reduction = True
cell = Cell(n_nodes, channels_pp, channels_p, c_cur, reduction_p, reduction)
self.cells.append(cell)
c_cur_out = c_cur * n_nodes
channels_pp, channels_p = channels_p, c_cur_out
if i == self.aux_pos:
self.aux_head = AuxiliaryHead(input_size // 4, channels_p, n_classes)
self.gap = nn.AdaptiveAvgPool2d(1)
self.linear = nn.Linear(channels_p, n_classes)
def forward(self, x):
s0 = s1 = self.stem(x)
aux_logits = None
for i, cell in enumerate(self.cells):
s0, s1 = s1, cell(s0, s1)
if i == self.aux_pos and self.training:
aux_logits = self.aux_head(s1)
out = self.gap(s1)
out = out.view(out.size(0), -1) # flatten
logits = self.linear(out)
if aux_logits is not None:
return logits, aux_logits
return logits
def drop_path_prob(self, p):
for module in self.modules():
if isinstance(module, ops.DropPath):
module.p = p
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch
import torch.nn as nn
class DropPath(nn.Module):
def __init__(self, p=0.):
"""
Drop path with probability.
Parameters
----------
p : float
Probability of an path to be zeroed.
"""
super().__init__()
self.p = p
def forward(self, x):
if self.training and self.p > 0.:
keep_prob = 1. - self.p
# per data point mask
mask = torch.zeros((x.size(0), 1, 1, 1), device=x.device).bernoulli_(keep_prob)
return x / keep_prob * mask
return x
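# Example of the intended behavior (a hedged sketch, not part of the original
# file): with p = 0.2 in training mode, roughly 20% of samples in a batch are
# zeroed entirely, and the survivors are scaled by 1 / keep_prob so the
# expected activation magnitude is unchanged:
#   drop = DropPath(p=0.2); drop.train()
#   y = drop(torch.randn(8, 16, 32, 32))  # some samples of y are all zeros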
class PoolBN(nn.Module):
"""
AvgPool or MaxPool with BN. `pool_type` must be `max` or `avg`.
"""
def __init__(self, pool_type, C, kernel_size, stride, padding, affine=True):
super().__init__()
if pool_type.lower() == 'max':
self.pool = nn.MaxPool2d(kernel_size, stride, padding)
elif pool_type.lower() == 'avg':
self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False)
else:
            raise ValueError("pool_type must be 'max' or 'avg', got {!r}".format(pool_type))
self.bn = nn.BatchNorm2d(C, affine=affine)
def forward(self, x):
out = self.pool(x)
out = self.bn(out)
return out
class StdConv(nn.Module):
"""
Standard conv: ReLU - Conv - BN
"""
def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
super().__init__()
self.net = nn.Sequential(
nn.ReLU(),
nn.Conv2d(C_in, C_out, kernel_size, stride, padding, bias=False),
nn.BatchNorm2d(C_out, affine=affine)
)
def forward(self, x):
return self.net(x)
class FacConv(nn.Module):
"""
Factorized conv: ReLU - Conv(Kx1) - Conv(1xK) - BN
"""
def __init__(self, C_in, C_out, kernel_length, stride, padding, affine=True):
super().__init__()
self.net = nn.Sequential(
nn.ReLU(),
nn.Conv2d(C_in, C_in, (kernel_length, 1), stride, padding, bias=False),
nn.Conv2d(C_in, C_out, (1, kernel_length), stride, padding, bias=False),
nn.BatchNorm2d(C_out, affine=affine)
)
def forward(self, x):
return self.net(x)
class DilConv(nn.Module):
"""
(Dilated) depthwise separable conv.
ReLU - (Dilated) depthwise separable - Pointwise - BN.
    With dilation == 2, a 3x3 conv has a 5x5 receptive field and a 5x5 conv has a 9x9
    receptive field (effective kernel size = dilation * (kernel_size - 1) + 1).
"""
def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True):
super().__init__()
self.net = nn.Sequential(
nn.ReLU(),
nn.Conv2d(C_in, C_in, kernel_size, stride, padding, dilation=dilation, groups=C_in,
bias=False),
nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False),
nn.BatchNorm2d(C_out, affine=affine)
)
def forward(self, x):
return self.net(x)
class SepConv(nn.Module):
"""
Depthwise separable conv.
    Two stacked DilConv blocks with dilation=1.
"""
def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
super().__init__()
self.net = nn.Sequential(
DilConv(C_in, C_in, kernel_size, stride, padding, dilation=1, affine=affine),
DilConv(C_in, C_out, kernel_size, 1, padding, dilation=1, affine=affine)
)
def forward(self, x):
return self.net(x)
class FactorizedReduce(nn.Module):
"""
    Reduce the feature-map size with two parallel stride-2 pointwise convolutions whose outputs are concatenated.
"""
def __init__(self, C_in, C_out, affine=True):
super().__init__()
self.relu = nn.ReLU()
self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
self.bn = nn.BatchNorm2d(C_out, affine=affine)
def forward(self, x):
x = self.relu(x)
out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1)
out = self.bn(out)
return out
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import time
from argparse import ArgumentParser
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import datasets
import utils
from model import CNN
from nni.nas.pytorch.fixed import apply_fixed_architecture
from nni.nas.pytorch.utils import AverageMeter
logger = logging.getLogger('nni')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
writer = SummaryWriter()
def train(config, train_loader, model, optimizer, criterion, epoch):
top1 = AverageMeter("top1")
top5 = AverageMeter("top5")
losses = AverageMeter("losses")
cur_step = epoch * len(train_loader)
cur_lr = optimizer.param_groups[0]["lr"]
logger.info("Epoch %d LR %.6f", epoch, cur_lr)
writer.add_scalar("lr", cur_lr, global_step=cur_step)
model.train()
for step, (x, y) in enumerate(train_loader):
x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
bs = x.size(0)
optimizer.zero_grad()
logits, aux_logits = model(x)
loss = criterion(logits, y)
if config.aux_weight > 0.:
loss += config.aux_weight * criterion(aux_logits, y)
loss.backward()
# gradient clipping
nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
optimizer.step()
accuracy = utils.accuracy(logits, y, topk=(1, 5))
losses.update(loss.item(), bs)
top1.update(accuracy["acc1"], bs)
top5.update(accuracy["acc5"], bs)
writer.add_scalar("loss/train", loss.item(), global_step=cur_step)
writer.add_scalar("acc1/train", accuracy["acc1"], global_step=cur_step)
writer.add_scalar("acc5/train", accuracy["acc5"], global_step=cur_step)
if step % config.log_frequency == 0 or step == len(train_loader) - 1:
logger.info(
"Train: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
"Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
epoch + 1, config.epochs, step, len(train_loader) - 1, losses=losses,
top1=top1, top5=top5))
cur_step += 1
logger.info("Train: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, config.epochs, top1.avg))
def validate(config, valid_loader, model, criterion, epoch, cur_step):
top1 = AverageMeter("top1")
top5 = AverageMeter("top5")
losses = AverageMeter("losses")
model.eval()
with torch.no_grad():
for step, (X, y) in enumerate(valid_loader):
X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)
bs = X.size(0)
logits = model(X)
loss = criterion(logits, y)
accuracy = utils.accuracy(logits, y, topk=(1, 5))
losses.update(loss.item(), bs)
top1.update(accuracy["acc1"], bs)
top5.update(accuracy["acc5"], bs)
if step % config.log_frequency == 0 or step == len(valid_loader) - 1:
logger.info(
"Valid: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
"Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
epoch + 1, config.epochs, step, len(valid_loader) - 1, losses=losses,
top1=top1, top5=top5))
writer.add_scalar("loss/test", losses.avg, global_step=cur_step)
writer.add_scalar("acc1/test", top1.avg, global_step=cur_step)
writer.add_scalar("acc5/test", top5.avg, global_step=cur_step)
logger.info("Valid: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, config.epochs, top1.avg))
return top1.avg
if __name__ == "__main__":
parser = ArgumentParser("darts")
parser.add_argument("--layers", default=20, type=int)
parser.add_argument("--batch-size", default=96, type=int)
parser.add_argument("--log-frequency", default=10, type=int)
parser.add_argument("--epochs", default=600, type=int)
parser.add_argument("--aux-weight", default=0.4, type=float)
parser.add_argument("--drop-path-prob", default=0.2, type=float)
parser.add_argument("--workers", default=4)
parser.add_argument("--grad-clip", default=5., type=float)
parser.add_argument("--arc-checkpoint", default="./checkpoints/epoch_0.json")
args = parser.parse_args()
dataset_train, dataset_valid = datasets.get_dataset("cifar10", cutout_length=16)
model = CNN(32, 3, 36, 10, args.layers, auxiliary=True)
apply_fixed_architecture(model, args.arc_checkpoint, device=device)
criterion = nn.CrossEntropyLoss()
model.to(device)
criterion.to(device)
optimizer = torch.optim.SGD(model.parameters(), 0.025, momentum=0.9, weight_decay=3.0E-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=1E-6)
train_loader = torch.utils.data.DataLoader(dataset_train,
batch_size=args.batch_size,
shuffle=True,
num_workers=args.workers,
pin_memory=True)
valid_loader = torch.utils.data.DataLoader(dataset_valid,
batch_size=args.batch_size,
shuffle=False,
num_workers=args.workers,
pin_memory=True)
best_top1 = 0.
for epoch in range(args.epochs):
drop_prob = args.drop_path_prob * epoch / args.epochs
model.drop_path_prob(drop_prob)
# training
train(args, train_loader, model, optimizer, criterion, epoch)
# validation
cur_step = (epoch + 1) * len(train_loader)
top1 = validate(args, valid_loader, model, criterion, epoch, cur_step)
best_top1 = max(best_top1, top1)
lr_scheduler.step()
logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import time
from argparse import ArgumentParser
import torch
import torch.nn as nn
import datasets
from model import CNN
from nni.nas.pytorch.callbacks import ArchitectureCheckpoint, LRSchedulerCallback
from nni.nas.pytorch.darts import DartsTrainer
from utils import accuracy
logger = logging.getLogger('nni')
if __name__ == "__main__":
parser = ArgumentParser("darts")
parser.add_argument("--layers", default=8, type=int)
parser.add_argument("--batch-size", default=64, type=int)
parser.add_argument("--log-frequency", default=10, type=int)
parser.add_argument("--epochs", default=50, type=int)
parser.add_argument("--unrolled", default=False, action="store_true")
args = parser.parse_args()
dataset_train, dataset_valid = datasets.get_dataset("cifar10")
model = CNN(32, 3, 16, 10, args.layers)
criterion = nn.CrossEntropyLoss()
optim = torch.optim.SGD(model.parameters(), 0.025, momentum=0.9, weight_decay=3.0E-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, args.epochs, eta_min=0.001)
trainer = DartsTrainer(model,
loss=criterion,
metrics=lambda output, target: accuracy(output, target, topk=(1,)),
optimizer=optim,
num_epochs=args.epochs,
dataset_train=dataset_train,
dataset_valid=dataset_valid,
batch_size=args.batch_size,
log_frequency=args.log_frequency,
unrolled=args.unrolled,
callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("./checkpoints")])
trainer.train()
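    # After each epoch, ArchitectureCheckpoint dumps the currently selected
    # architecture as JSON under ./checkpoints; the retraining script above
    # consumes one of these files via --arc-checkpoint
    # (default ./checkpoints/epoch_0.json).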
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
def accuracy(output, target, topk=(1,)):
""" Computes the precision@k for the specified values of k """
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
# one-hot case
if target.ndimension() > 1:
target = target.max(1)[1]
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = dict()
for k in topk:
        # reshape rather than view, so the flatten works regardless of contiguity
        correct_k = correct[:k].reshape(-1).float().sum(0)
res["acc{}".format(k)] = correct_k.mul_(1.0 / batch_size).item()
return res
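# Usage sketch (assumed shapes): `output` is an (N, C) logits tensor and
# `target` is an (N,) tensor of class indices;
#   accuracy(logits, labels, topk=(1, 5)) -> {"acc1": 0.5, "acc5": 0.9}
# (values illustrative).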
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from torchvision import transforms
from torchvision.datasets import CIFAR10
def get_dataset(cls):
MEAN = [0.49139968, 0.48215827, 0.44653124]
STD = [0.24703233, 0.24348505, 0.26158768]
transf = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
normalize = [
transforms.ToTensor(),
transforms.Normalize(MEAN, STD)
]
train_transform = transforms.Compose(transf + normalize)
valid_transform = transforms.Compose(normalize)
if cls == "cifar10":
dataset_train = CIFAR10(root="./data", train=True, download=True, transform=train_transform)
dataset_valid = CIFAR10(root="./data", train=False, download=True, transform=valid_transform)
else:
raise NotImplementedError
return dataset_train, dataset_valid
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch.nn as nn
from nni.nas.pytorch import mutables
from ops import FactorizedReduce, ConvBranch, PoolBranch
class ENASLayer(mutables.MutableScope):
def __init__(self, key, prev_labels, in_filters, out_filters):
super().__init__(key)
self.in_filters = in_filters
self.out_filters = out_filters
self.mutable = mutables.LayerChoice([
ConvBranch(in_filters, out_filters, 3, 1, 1, separable=False),
ConvBranch(in_filters, out_filters, 3, 1, 1, separable=True),
ConvBranch(in_filters, out_filters, 5, 1, 2, separable=False),
ConvBranch(in_filters, out_filters, 5, 1, 2, separable=True),
PoolBranch('avg', in_filters, out_filters, 3, 1, 1),
PoolBranch('max', in_filters, out_filters, 3, 1, 1)
])
if len(prev_labels) > 0:
self.skipconnect = mutables.InputChoice(choose_from=prev_labels, n_chosen=None)
else:
self.skipconnect = None
self.batch_norm = nn.BatchNorm2d(out_filters, affine=False)
def forward(self, prev_layers):
out = self.mutable(prev_layers[-1])
if self.skipconnect is not None:
connection = self.skipconnect(prev_layers[:-1])
if connection is not None:
out += connection
return self.batch_norm(out)
class GeneralNetwork(nn.Module):
def __init__(self, num_layers=12, out_filters=24, in_channels=3, num_classes=10,
dropout_rate=0.0):
super().__init__()
self.num_layers = num_layers
self.num_classes = num_classes
self.out_filters = out_filters
self.stem = nn.Sequential(
nn.Conv2d(in_channels, out_filters, 3, 1, 1, bias=False),
nn.BatchNorm2d(out_filters)
)
pool_distance = self.num_layers // 3
self.pool_layers_idx = [pool_distance - 1, 2 * pool_distance - 1]
self.dropout_rate = dropout_rate
self.dropout = nn.Dropout(self.dropout_rate)
self.layers = nn.ModuleList()
self.pool_layers = nn.ModuleList()
labels = []
for layer_id in range(self.num_layers):
labels.append("layer_{}".format(layer_id))
if layer_id in self.pool_layers_idx:
self.pool_layers.append(FactorizedReduce(self.out_filters, self.out_filters))
self.layers.append(ENASLayer(labels[-1], labels[:-1], self.out_filters, self.out_filters))
self.gap = nn.AdaptiveAvgPool2d(1)
self.dense = nn.Linear(self.out_filters, self.num_classes)
def forward(self, x):
bs = x.size(0)
cur = self.stem(x)
layers = [cur]
for layer_id in range(self.num_layers):
cur = self.layers[layer_id](layers)
layers.append(cur)
if layer_id in self.pool_layers_idx:
for i, layer in enumerate(layers):
layers[i] = self.pool_layers[self.pool_layers_idx.index(layer_id)](layer)
cur = layers[-1]
cur = self.gap(cur).view(bs, -1)
cur = self.dropout(cur)
logits = self.dense(cur)
return logits
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch
import torch.nn as nn
import torch.nn.functional as F
from nni.nas.pytorch import mutables
from ops import FactorizedReduce, StdConv, SepConvBN, Pool
class AuxiliaryHead(nn.Module):
def __init__(self, in_channels, num_classes):
super().__init__()
self.in_channels = in_channels
self.num_classes = num_classes
self.pooling = nn.Sequential(
nn.ReLU(),
nn.AvgPool2d(5, 3, 2)
)
self.proj = nn.Sequential(
StdConv(in_channels, 128),
StdConv(128, 768)
)
self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(768, num_classes, bias=False)  # use num_classes rather than a hard-coded 10
def forward(self, x):
bs = x.size(0)
x = self.pooling(x)
x = self.proj(x)
x = self.avg_pool(x).view(bs, -1)
x = self.fc(x)
return x
class Cell(nn.Module):
def __init__(self, cell_name, prev_labels, channels):
super().__init__()
self.input_choice = mutables.InputChoice(choose_from=prev_labels, n_chosen=1, return_mask=True,
key=cell_name + "_input")
self.op_choice = mutables.LayerChoice([
SepConvBN(channels, channels, 3, 1),
SepConvBN(channels, channels, 5, 2),
Pool("avg", 3, 1, 1),
Pool("max", 3, 1, 1),
nn.Identity()
], key=cell_name + "_op")
def forward(self, prev_layers):
chosen_input, chosen_mask = self.input_choice(prev_layers)
cell_out = self.op_choice(chosen_input)
return cell_out, chosen_mask
class Node(mutables.MutableScope):
def __init__(self, node_name, prev_node_names, channels):
super().__init__(node_name)
self.cell_x = Cell(node_name + "_x", prev_node_names, channels)
self.cell_y = Cell(node_name + "_y", prev_node_names, channels)
def forward(self, prev_layers):
out_x, mask_x = self.cell_x(prev_layers)
out_y, mask_y = self.cell_y(prev_layers)
return out_x + out_y, mask_x | mask_y
class Calibration(nn.Module):
def __init__(self, in_channels, out_channels):
super().__init__()
self.process = None
if in_channels != out_channels:
self.process = StdConv(in_channels, out_channels)
def forward(self, x):
if self.process is None:
return x
return self.process(x)
class ReductionLayer(nn.Module):
def __init__(self, in_channels_pp, in_channels_p, out_channels):
super().__init__()
self.reduce0 = FactorizedReduce(in_channels_pp, out_channels, affine=False)
self.reduce1 = FactorizedReduce(in_channels_p, out_channels, affine=False)
def forward(self, pprev, prev):
return self.reduce0(pprev), self.reduce1(prev)
class ENASLayer(nn.Module):
def __init__(self, num_nodes, in_channels_pp, in_channels_p, out_channels, reduction):
super().__init__()
self.preproc0 = Calibration(in_channels_pp, out_channels)
self.preproc1 = Calibration(in_channels_p, out_channels)
self.num_nodes = num_nodes
name_prefix = "reduce" if reduction else "normal"
self.nodes = nn.ModuleList()
node_labels = [mutables.InputChoice.NO_KEY, mutables.InputChoice.NO_KEY]
for i in range(num_nodes):
node_labels.append("{}_node_{}".format(name_prefix, i))
self.nodes.append(Node(node_labels[-1], node_labels[:-1], out_channels))
self.final_conv_w = nn.Parameter(torch.zeros(out_channels, self.num_nodes + 2, out_channels, 1, 1), requires_grad=True)
self.bn = nn.BatchNorm2d(out_channels, affine=False)
self.reset_parameters()
def reset_parameters(self):
nn.init.kaiming_normal_(self.final_conv_w)
def forward(self, pprev, prev):
pprev_, prev_ = self.preproc0(pprev), self.preproc1(prev)
prev_nodes_out = [pprev_, prev_]
nodes_used_mask = torch.zeros(self.num_nodes + 2, dtype=torch.bool, device=prev.device)
for i in range(self.num_nodes):
node_out, mask = self.nodes[i](prev_nodes_out)
nodes_used_mask[:mask.size(0)] |= mask
prev_nodes_out.append(node_out)
unused_nodes = torch.cat([out for used, out in zip(nodes_used_mask, prev_nodes_out) if not used], 1)
unused_nodes = F.relu(unused_nodes)
conv_weight = self.final_conv_w[:, ~nodes_used_mask, :, :, :]
conv_weight = conv_weight.view(conv_weight.size(0), -1, 1, 1)
out = F.conv2d(unused_nodes, conv_weight)
return prev, self.bn(out)
class MicroNetwork(nn.Module):
def __init__(self, num_layers=2, num_nodes=5, out_channels=24, in_channels=3, num_classes=10,
dropout_rate=0.0, use_aux_heads=False):
super().__init__()
self.num_layers = num_layers
self.use_aux_heads = use_aux_heads
self.stem = nn.Sequential(
nn.Conv2d(in_channels, out_channels * 3, 3, 1, 1, bias=False),
nn.BatchNorm2d(out_channels * 3)
)
pool_distance = self.num_layers // 3
pool_layers = [pool_distance, 2 * pool_distance + 1]
self.dropout = nn.Dropout(dropout_rate)
self.layers = nn.ModuleList()
c_pp = c_p = out_channels * 3
c_cur = out_channels
for layer_id in range(self.num_layers + 2):
reduction = False
if layer_id in pool_layers:
c_cur, reduction = c_p * 2, True
self.layers.append(ReductionLayer(c_pp, c_p, c_cur))
c_pp = c_p = c_cur
self.layers.append(ENASLayer(num_nodes, c_pp, c_p, c_cur, reduction))
if self.use_aux_heads and layer_id == pool_layers[-1] + 1:
self.layers.append(AuxiliaryHead(c_cur, num_classes))
c_pp, c_p = c_p, c_cur
self.gap = nn.AdaptiveAvgPool2d(1)
self.dense = nn.Linear(c_cur, num_classes)
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight)
def forward(self, x):
bs = x.size(0)
prev = cur = self.stem(x)
aux_logits = None
for layer in self.layers:
if isinstance(layer, AuxiliaryHead):
if self.training:
aux_logits = layer(cur)
else:
prev, cur = layer(prev, cur)
cur = self.gap(F.relu(cur)).view(bs, -1)
cur = self.dropout(cur)
logits = self.dense(cur)
if aux_logits is not None:
return logits, aux_logits
return logits
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch
import torch.nn as nn
class StdConv(nn.Module):
def __init__(self, C_in, C_out):
super(StdConv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False),
nn.BatchNorm2d(C_out, affine=False),
nn.ReLU()
)
def forward(self, x):
return self.conv(x)
class PoolBranch(nn.Module):
def __init__(self, pool_type, C_in, C_out, kernel_size, stride, padding, affine=False):
super().__init__()
self.preproc = StdConv(C_in, C_out)
self.pool = Pool(pool_type, kernel_size, stride, padding)
self.bn = nn.BatchNorm2d(C_out, affine=affine)
def forward(self, x):
out = self.preproc(x)
out = self.pool(out)
out = self.bn(out)
return out
class SeparableConv(nn.Module):
def __init__(self, C_in, C_out, kernel_size, stride, padding):
super(SeparableConv, self).__init__()
self.depthwise = nn.Conv2d(C_in, C_in, kernel_size=kernel_size, padding=padding, stride=stride,
groups=C_in, bias=False)
self.pointwise = nn.Conv2d(C_in, C_out, kernel_size=1, bias=False)
def forward(self, x):
out = self.depthwise(x)
out = self.pointwise(out)
return out
class ConvBranch(nn.Module):
def __init__(self, C_in, C_out, kernel_size, stride, padding, separable):
super(ConvBranch, self).__init__()
self.preproc = StdConv(C_in, C_out)
if separable:
self.conv = SeparableConv(C_out, C_out, kernel_size, stride, padding)
else:
self.conv = nn.Conv2d(C_out, C_out, kernel_size, stride=stride, padding=padding)
self.postproc = nn.Sequential(
nn.BatchNorm2d(C_out, affine=False),
nn.ReLU()
)
def forward(self, x):
out = self.preproc(x)
out = self.conv(out)
out = self.postproc(out)
return out
class FactorizedReduce(nn.Module):
def __init__(self, C_in, C_out, affine=False):
super().__init__()
self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
self.bn = nn.BatchNorm2d(C_out, affine=affine)
def forward(self, x):
out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1)
out = self.bn(out)
return out
class Pool(nn.Module):
def __init__(self, pool_type, kernel_size, stride, padding):
super().__init__()
if pool_type.lower() == 'max':
self.pool = nn.MaxPool2d(kernel_size, stride, padding)
elif pool_type.lower() == 'avg':
self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False)
else:
            raise ValueError("pool_type must be 'max' or 'avg', got {!r}".format(pool_type))
def forward(self, x):
return self.pool(x)
class SepConvBN(nn.Module):
def __init__(self, C_in, C_out, kernel_size, padding):
super().__init__()
self.relu = nn.ReLU()
self.conv = SeparableConv(C_in, C_out, kernel_size, 1, padding)
self.bn = nn.BatchNorm2d(C_out, affine=True)
def forward(self, x):
x = self.relu(x)
x = self.conv(x)
x = self.bn(x)
return x
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import time
from argparse import ArgumentParser
import torch
import torch.nn as nn
import datasets
from macro import GeneralNetwork
from micro import MicroNetwork
from nni.nas.pytorch import enas
from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint,
LRSchedulerCallback)
from utils import accuracy, reward_accuracy
logger = logging.getLogger('nni')
if __name__ == "__main__":
parser = ArgumentParser("enas")
parser.add_argument("--batch-size", default=128, type=int)
parser.add_argument("--log-frequency", default=10, type=int)
parser.add_argument("--search-for", choices=["macro", "micro"], default="macro")
args = parser.parse_args()
dataset_train, dataset_valid = datasets.get_dataset("cifar10")
if args.search_for == "macro":
model = GeneralNetwork()
num_epochs = 310
mutator = None
elif args.search_for == "micro":
model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, use_aux_heads=True)
num_epochs = 150
mutator = enas.EnasMutator(model, tanh_constant=1.1, cell_exit_extra_step=True)
else:
raise AssertionError
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), 0.05, momentum=0.9, weight_decay=1.0E-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0.001)
trainer = enas.EnasTrainer(model,
loss=criterion,
metrics=accuracy,
reward_function=reward_accuracy,
optimizer=optimizer,
callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("./checkpoints")],
batch_size=args.batch_size,
num_epochs=num_epochs,
dataset_train=dataset_train,
dataset_valid=dataset_valid,
log_frequency=args.log_frequency,
mutator=mutator)
trainer.train()
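    # As in the DARTS example above, ArchitectureCheckpoint writes the sampled
    # architecture to JSON files under ./checkpoints (a hedged note: the exact
    # dump timing depends on the callback implementation).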
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch
def accuracy(output, target, topk=(1,)):
""" Computes the precision@k for the specified values of k """
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
# one-hot case
if target.ndimension() > 1:
target = target.max(1)[1]
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = dict()
for k in topk:
        # reshape rather than view, so the flatten works regardless of contiguity
        correct_k = correct[:k].reshape(-1).float().sum(0)
res["acc{}".format(k)] = correct_k.mul_(1.0 / batch_size).item()
return res
def reward_accuracy(output, target, topk=(1,)):
batch_size = target.size(0)
_, predicted = torch.max(output.data, 1)
return (predicted == target).sum().item() / batch_size
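# Note: reward_accuracy returns a plain top-1 float (used as the ENAS
# controller's reward), while accuracy above returns a dict keyed by
# "acc{k}" for logging.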