Commit 1011377c authored by qianyj

the source code of NNI for DCU

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import sys
import os
import logging
import pickle
import shutil
import random
import math
import time
import datetime
import argparse
import distutils.util
import numpy as np
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torch.nn.functional as Func
from model import Model
from nni.nas.pytorch.fixed import apply_fixed_architecture
from dataloader import read_data_sst
logger = logging.getLogger("nni.textnas")
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--reset_output_dir",
type=distutils.util.strtobool,
default=True,
help="Whether to clean the output dir if existed. (default: %(default)s)")
parser.add_argument(
"--child_fixed_arc",
type=str,
required=True,
help="Architecture json file. (default: %(default)s)")
parser.add_argument(
"--data_path",
type=str,
default="data",
help="Directory containing the dataset and embedding file. (default: %(default)s)")
parser.add_argument(
"--output_dir",
type=str,
default="output",
help="The output directory. (default: %(default)s)")
parser.add_argument(
"--child_lr_decay_scheme",
type=str,
default="cosine",
help="Learning rate annealing strategy, only 'cosine' supported. (default: %(default)s)")
parser.add_argument(
"--batch_size",
type=int,
default=128,
help="Number of samples each batch for training. (default: %(default)s)")
parser.add_argument(
"--eval_batch_size",
type=int,
default=128,
help="Number of samples each batch for evaluation. (default: %(default)s)")
parser.add_argument(
"--class_num",
type=int,
default=5,
help="The number of categories. (default: %(default)s)")
parser.add_argument(
"--global_seed",
type=int,
default=1234,
help="Seed for reproduction. (default: %(default)s)")
parser.add_argument(
"--max_input_length",
type=int,
default=64,
help="The maximum length of the sentence. (default: %(default)s)")
parser.add_argument(
"--num_epochs",
type=int,
default=10,
help="The number of training epochs. (default: %(default)s)")
parser.add_argument(
"--child_num_layers",
type=int,
default=24,
help="The layer number of the architecture. (default: %(default)s)")
parser.add_argument(
"--child_out_filters",
type=int,
default=256,
help="The dimension of hidden states. (default: %(default)s)")
parser.add_argument(
"--child_out_filters_scale",
type=int,
default=1,
help="The scale of hidden state dimension. (default: %(default)s)")
parser.add_argument(
"--child_lr_T_0",
type=int,
default=10,
help="The length of one cycle. (default: %(default)s)")
parser.add_argument(
"--child_lr_T_mul",
type=int,
default=2,
help="The multiplication factor per cycle. (default: %(default)s)")
parser.add_argument(
"--min_count",
type=int,
default=1,
help="The threshold to cut off low frequent words. (default: %(default)s)")
parser.add_argument(
"--train_ratio",
type=float,
default=1.0,
help="The sample ratio for the training set. (default: %(default)s)")
parser.add_argument(
"--valid_ratio",
type=float,
default=1.0,
help="The sample ratio for the dev set. (default: %(default)s)")
parser.add_argument(
"--child_grad_bound",
type=float,
default=5.0,
help="The threshold for gradient clipping. (default: %(default)s)")
parser.add_argument(
"--child_lr",
type=float,
default=0.02,
help="The initial learning rate. (default: %(default)s)")
parser.add_argument(
"--cnn_keep_prob",
type=float,
default=0.8,
help="Keep prob for cnn layer. (default: %(default)s)")
parser.add_argument(
"--final_output_keep_prob",
type=float,
default=1.0,
help="Keep prob for the last output layer. (default: %(default)s)")
parser.add_argument(
"--lstm_out_keep_prob",
type=float,
default=0.8,
help="Keep prob for the RNN layer. (default: %(default)s)")
parser.add_argument(
"--embed_keep_prob",
type=float,
default=0.8,
help="Keep prob for the embedding layer. (default: %(default)s)")
parser.add_argument(
"--attention_keep_prob",
type=float,
default=0.8,
help="Keep prob for the self-attention layer. (default: %(default)s)")
parser.add_argument(
"--child_l2_reg",
type=float,
default=3e-6,
help="Weight decay factor. (default: %(default)s)")
parser.add_argument(
"--child_lr_max",
type=float,
default=0.002,
help="The max learning rate. (default: %(default)s)")
parser.add_argument(
"--child_lr_min",
type=float,
default=0.001,
help="The min learning rate. (default: %(default)s)")
parser.add_argument(
"--child_optim_algo",
type=str,
default="adam",
help="Optimization algorithm. (default: %(default)s)")
parser.add_argument(
"--checkpoint_dir",
type=str,
default="best_checkpoint",
help="Path for saved checkpoints. (default: %(default)s)")
parser.add_argument(
"--output_type",
type=str,
default="avg",
help="Opertor type for the time steps reduction. (default: %(default)s)")
parser.add_argument(
"--multi_path",
type=distutils.util.strtobool,
default=False,
help="Search for multiple path in the architecture. (default: %(default)s)")
parser.add_argument(
"--is_binary",
type=distutils.util.strtobool,
default=False,
help="Binary label for sst dataset. (default: %(default)s)")
parser.add_argument(
"--is_cuda",
type=distutils.util.strtobool,
default=True,
help="Specify the device type. (default: %(default)s)")
parser.add_argument(
"--is_mask",
type=distutils.util.strtobool,
default=True,
help="Apply mask. (default: %(default)s)")
parser.add_argument(
"--fixed_seed",
type=distutils.util.strtobool,
default=True,
help="Fix the seed. (default: %(default)s)")
parser.add_argument(
"--load_checkpoint",
type=distutils.util.strtobool,
default=False,
help="Wether to load checkpoint. (default: %(default)s)")
parser.add_argument(
"--log_every",
type=int,
default=50,
help="How many steps to log. (default: %(default)s)")
parser.add_argument(
"--eval_every_epochs",
type=int,
default=1,
help="How many epochs to eval. (default: %(default)s)")
global FLAGS
FLAGS = parser.parse_args()
def set_random_seed(seed):
logger.info("set random seed for data reading: {}".format(seed))
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
if FLAGS.is_cuda:
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
def get_model(embedding, num_layers):
logger.info("num layers: {0}".format(num_layers))
assert FLAGS.child_fixed_arc is not None, "Architecture should be provided."
child_model = Model(
embedding=embedding,
hidden_units=FLAGS.child_out_filters_scale * FLAGS.child_out_filters,
num_layers=num_layers,
num_classes=FLAGS.class_num,
choose_from_k=5 if FLAGS.multi_path else 1,
lstm_keep_prob=FLAGS.lstm_out_keep_prob,
cnn_keep_prob=FLAGS.cnn_keep_prob,
att_keep_prob=FLAGS.attention_keep_prob,
att_mask=FLAGS.is_mask,
embed_keep_prob=FLAGS.embed_keep_prob,
final_output_keep_prob=FLAGS.final_output_keep_prob,
global_pool=FLAGS.output_type)
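    # apply_fixed_architecture replaces every mutable in the model with the
    # choice recorded in the architecture JSON file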
apply_fixed_architecture(child_model, FLAGS.child_fixed_arc)
return child_model
def eval_once(child_model, device, eval_set, criterion, valid_dataloader=None, test_dataloader=None):
if eval_set == "test":
assert test_dataloader is not None
dataloader = test_dataloader
elif eval_set == "valid":
assert valid_dataloader is not None
dataloader = valid_dataloader
else:
raise NotImplementedError("Unknown eval_set '{}'".format(eval_set))
tot_acc = 0
tot = 0
losses = []
with torch.no_grad(): # save memory
for batch in dataloader:
(sent_ids, mask), labels = batch
sent_ids = sent_ids.to(device, non_blocking=True)
mask = mask.to(device, non_blocking=True)
labels = labels.to(device, non_blocking=True)
logits = child_model((sent_ids, mask)) # run
loss = criterion(logits, labels.long())
loss = loss.mean()
preds = logits.argmax(dim=1).long()
acc = torch.eq(preds, labels.long()).long().sum().item()
losses.append(loss)
tot_acc += acc
tot += len(labels)
losses = torch.tensor(losses)
loss = losses.mean()
if tot > 0:
final_acc = float(tot_acc) / tot
else:
final_acc = 0
logger.info("Error in calculating final_acc")
return final_acc, loss
def print_user_flags(FLAGS, line_limit=80):
log_strings = "\n" + "-" * line_limit + "\n"
for flag_name in sorted(vars(FLAGS)):
value = "{}".format(getattr(FLAGS, flag_name))
log_string = flag_name
log_string += "." * (line_limit - len(flag_name) - len(value))
log_string += value
log_strings = log_strings + log_string
log_strings = log_strings + "\n"
log_strings += "-" * line_limit
logger.info(log_strings)
def count_model_params(trainable_params):
num_vars = 0
for var in trainable_params:
num_vars += np.prod([dim for dim in var.size()])
return num_vars
def update_lr(
optimizer,
epoch,
l2_reg=1e-4,
lr_warmup_val=None,
lr_init=0.1,
lr_decay_scheme="cosine",
lr_max=0.002,
lr_min=0.000000001,
lr_T_0=4,
lr_T_mul=1,
sync_replicas=False,
num_aggregate=None,
num_replicas=None):
if lr_decay_scheme == "cosine":
assert lr_max is not None, "Need lr_max to use lr_cosine"
assert lr_min is not None, "Need lr_min to use lr_cosine"
assert lr_T_0 is not None, "Need lr_T_0 to use lr_cosine"
assert lr_T_mul is not None, "Need lr_T_mul to use lr_cosine"
T_i = lr_T_0
t_epoch = epoch
last_reset = 0
while True:
t_epoch -= T_i
if t_epoch < 0:
break
last_reset += T_i
T_i *= lr_T_mul
T_curr = epoch - last_reset
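        # SGDR-style cosine annealing with warm restarts: within the current cycle of
        # length T_i, lr = lr_min + 0.5 * (lr_max - lr_min) * (1 + cos(pi * T_curr / T_i)),
        # and each new cycle is lr_T_mul times longer than the previous one.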
def _update():
            rate = T_curr / T_i * math.pi
lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + math.cos(rate))
return lr
learning_rate = _update()
else:
raise ValueError("Unknown learning rate decay scheme {}".format(lr_decay_scheme))
#update lr in optimizer
for params_group in optimizer.param_groups:
params_group['lr'] = learning_rate
return learning_rate
def train(device, data_path, output_dir, num_layers):
logger.info("Build dataloader")
train_dataset, valid_dataset, test_dataset, embedding = \
read_data_sst(data_path,
FLAGS.max_input_length,
FLAGS.min_count,
train_ratio=FLAGS.train_ratio,
valid_ratio=FLAGS.valid_ratio,
is_binary=FLAGS.is_binary)
train_dataloader = DataLoader(train_dataset, batch_size=FLAGS.batch_size, shuffle=True, pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=FLAGS.eval_batch_size, pin_memory=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=FLAGS.eval_batch_size, pin_memory=True)
logger.info("Build model")
child_model = get_model(embedding, num_layers)
logger.info("Finish build model")
#for name, var in child_model.named_parameters():
# logger.info(name, var.size(), var.requires_grad) # output all params
num_vars = count_model_params(child_model.parameters())
logger.info("Model has {} params".format(num_vars))
for m in child_model.modules(): # initializer
if isinstance(m, (nn.Conv1d, nn.Linear)):
nn.init.xavier_uniform_(m.weight)
criterion = nn.CrossEntropyLoss()
# get optimizer
if FLAGS.child_optim_algo == "adam":
optimizer = optim.Adam(child_model.parameters(), eps=1e-3, weight_decay=FLAGS.child_l2_reg) # with L2
else:
raise ValueError("Unknown optim_algo {}".format(FLAGS.child_optim_algo))
child_model.to(device)
criterion.to(device)
logger.info("Start training")
start_time = time.time()
step = 0
# save path
    # use the output_dir argument passed into train() for all checkpoints
    model_save_path = os.path.join(output_dir, "model.pth")
    best_model_save_path = os.path.join(output_dir, "best_model.pth")
best_acc = 0
start_epoch = 0
if FLAGS.load_checkpoint:
if os.path.isfile(model_save_path):
checkpoint = torch.load(model_save_path, map_location = torch.device('cpu'))
step = checkpoint['step']
start_epoch = checkpoint['epoch']
child_model.load_state_dict(checkpoint['child_model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
for epoch in range(start_epoch, FLAGS.num_epochs):
lr = update_lr(optimizer,
epoch,
l2_reg=FLAGS.child_l2_reg,
lr_warmup_val=None,
lr_init=FLAGS.child_lr,
lr_decay_scheme=FLAGS.child_lr_decay_scheme,
lr_max=FLAGS.child_lr_max,
lr_min=FLAGS.child_lr_min,
lr_T_0=FLAGS.child_lr_T_0,
lr_T_mul=FLAGS.child_lr_T_mul)
child_model.train()
for batch in train_dataloader:
(sent_ids, mask), labels = batch
sent_ids = sent_ids.to(device, non_blocking=True)
mask = mask.to(device, non_blocking=True)
labels = labels.to(device, non_blocking=True)
step += 1
logits = child_model((sent_ids, mask)) # run
loss = criterion(logits, labels.long())
loss = loss.mean()
preds = logits.argmax(dim=1).long()
acc = torch.eq(preds, labels.long()).long().sum().item()
optimizer.zero_grad()
loss.backward()
            assert FLAGS.child_grad_bound is not None, "Need grad_bound to clip gradients."
            # materialize the parameters into a list: the original generator would be exhausted
            # by the first clip_grad_norm_ call, so the per-parameter clipping would never run
            trainable_params = list(child_model.parameters())
            # compute the total gradient norm (the huge bound means nothing is clipped here)
            grad_norm = nn.utils.clip_grad_norm_(trainable_params, 99999999)
            # clip each parameter's gradient to the configured bound
            for param in trainable_params:
                nn.utils.clip_grad_norm_(param, FLAGS.child_grad_bound)
optimizer.step()
if step % FLAGS.log_every == 0:
curr_time = time.time()
log_string = ""
log_string += "epoch={:<6d}".format(epoch)
log_string += "ch_step={:<6d}".format(step)
log_string += " loss={:<8.6f}".format(loss)
log_string += " lr={:<8.4f}".format(lr)
log_string += " |g|={:<8.4f}".format(grad_norm)
log_string += " tr_acc={:<3d}/{:>3d}".format(acc, logits.size()[0])
log_string += " mins={:<10.2f}".format(float(curr_time - start_time) / 60)
logger.info(log_string)
epoch += 1
save_state = {
'step' : step,
'epoch' : epoch,
'child_model_state_dict' : child_model.state_dict(),
'optimizer_state_dict' : optimizer.state_dict()}
torch.save(save_state, model_save_path)
child_model.eval()
logger.info("Epoch {}: Eval".format(epoch))
eval_acc, eval_loss = eval_once(child_model, device, "test", criterion, test_dataloader=test_dataloader)
logger.info("ch_step={} {}_accuracy={:<6.4f} {}_loss={:<6.4f}".format(step, "test", eval_acc, "test", eval_loss))
if eval_acc > best_acc:
best_acc = eval_acc
logger.info("Save best model")
save_state = {
'step' : step,
'epoch' : epoch,
'child_model_state_dict' : child_model.state_dict(),
'optimizer_state_dict' : optimizer.state_dict()}
torch.save(save_state, best_model_save_path)
return eval_acc
def main():
parse_args()
if not os.path.isdir(FLAGS.output_dir):
logger.info("Path {} does not exist. Creating.".format(FLAGS.output_dir))
os.makedirs(FLAGS.output_dir)
elif FLAGS.reset_output_dir:
logger.info("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
shutil.rmtree(FLAGS.output_dir, ignore_errors=True)
os.makedirs(FLAGS.output_dir)
print_user_flags(FLAGS)
if FLAGS.fixed_seed:
set_random_seed(FLAGS.global_seed)
device = torch.device("cuda" if FLAGS.is_cuda else "cpu")
train(device, FLAGS.data_path, FLAGS.output_dir, FLAGS.child_num_layers)
if __name__ == "__main__":
main()
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
export PYTHONPATH="$(pwd)"
python3 -u retrain.py \
--train_ratio=1.0 \
--valid_ratio=1.0 \
--min_count=1 \
--is_mask=True \
--is_binary=True \
--child_lr_decay_scheme="cosine" \
--data_path="data" \
--class_num=2 \
--child_optim_algo="adam" \
--output_dir="output_sst2" \
--global_seed=1234 \
--max_input_length=64 \
--batch_size=128 \
--eval_batch_size=128 \
--num_epochs=10 \
--log_every=50 \
--eval_every_epochs=1 \
--child_num_layers=24 \
--child_out_filters=256 \
--child_l2_reg=1e-6 \
--cnn_keep_prob=0.8 \
--final_output_keep_prob=1.0 \
--embed_keep_prob=0.8 \
--lstm_out_keep_prob=0.8 \
--attention_keep_prob=0.8 \
--child_lr=0.02 \
--child_lr_max=0.002 \
--child_lr_min=5e-6 \
--child_lr_T_0=10 \
--child_lr_T_mul=2 \
--multi_path=True \
--child_fixed_arc="./arc/final_arc.json" \
--fixed_seed=True \
"$@"
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import os
import random
from argparse import ArgumentParser
from itertools import cycle
import numpy as np
import torch
import torch.nn as nn
from nni.algorithms.nas.pytorch.enas import EnasMutator, EnasTrainer
from nni.nas.pytorch.callbacks import LRSchedulerCallback
from dataloader import read_data_sst
from model import Model
from utils import accuracy
logger = logging.getLogger("nni.textnas")
class TextNASTrainer(EnasTrainer):
def __init__(self, *args, train_loader=None, valid_loader=None, test_loader=None, **kwargs):
super().__init__(*args, **kwargs)
self.train_loader = train_loader
self.valid_loader = valid_loader
self.test_loader = test_loader
def init_dataloader(self):
pass
if __name__ == "__main__":
parser = ArgumentParser("textnas")
parser.add_argument("--batch-size", default=128, type=int)
parser.add_argument("--log-frequency", default=50, type=int)
parser.add_argument("--seed", default=1234, type=int)
parser.add_argument("--epochs", default=10, type=int)
parser.add_argument("--lr", default=5e-3, type=float)
args = parser.parse_args()
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)
torch.backends.cudnn.deterministic = True
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
train_dataset, valid_dataset, test_dataset, embedding = read_data_sst("data")
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, num_workers=4, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=args.batch_size, num_workers=4, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size, num_workers=4)
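    # wrap the loaders with cycle() so the ENAS trainer can keep drawing batches
    # across its mutator/child steps without the iterators being exhausted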
train_loader, valid_loader = cycle(train_loader), cycle(valid_loader)
model = Model(embedding)
mutator = EnasMutator(model, temperature=None, tanh_constant=None, entropy_reduction="mean")
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, eps=1e-3, weight_decay=2e-6)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs, eta_min=1e-5)
trainer = TextNASTrainer(model,
loss=criterion,
metrics=lambda output, target: {"acc": accuracy(output, target)},
reward_function=accuracy,
optimizer=optimizer,
callbacks=[LRSchedulerCallback(lr_scheduler)],
batch_size=args.batch_size,
num_epochs=args.epochs,
dataset_train=None,
dataset_valid=None,
train_loader=train_loader,
valid_loader=valid_loader,
test_loader=test_loader,
log_frequency=args.log_frequency,
mutator=mutator,
mutator_lr=2e-3,
mutator_steps=500,
mutator_steps_aggregate=1,
child_steps=3000,
baseline_decay=0.99,
test_arc_per_epoch=10)
trainer.train()
os.makedirs("checkpoints", exist_ok=True)
for i in range(20):
trainer.export(os.path.join("checkpoints", "architecture_%02d.json" % i))
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import torch
import torch.nn as nn
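# INF is used as a large negative offset to keep padded positions out of max-pooling;
# EPS guards the masked average against division by zero.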
INF = 1E10
EPS = 1E-12
logger = logging.getLogger("nni.textnas")
def get_length(mask):
length = torch.sum(mask, 1)
length = length.long().cpu()
return length
class GlobalAvgPool(nn.Module):
def forward(self, x, mask):
x = torch.sum(x, 2)
length = torch.sum(mask, 1, keepdim=True).float()
length += torch.eq(length, 0.0).float() * EPS
length = length.repeat(1, x.size()[1])
x /= length
return x
class GlobalMaxPool(nn.Module):
    def forward(self, x, mask):
        # push padded positions to a large negative value so they never win the max
        mask = torch.eq(mask.float(), 0.0).float()
        mask = torch.unsqueeze(mask, dim=1).repeat(1, x.size()[1], 1)
        x, _ = torch.max(x + mask * -INF, 2)
        return x
class IteratorWrapper:
def __init__(self, loader):
self.loader = loader
self.iterator = None
def __iter__(self):
self.iterator = iter(self.loader)
return self
def __len__(self):
return len(self.loader)
def __next__(self):
data = next(self.iterator)
text, length = data.text
max_length = text.size(1)
label = data.label - 1
bs = label.size(0)
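        # build a boolean mask of shape (batch_size, max_length):
        # position j of sample i is True iff j < length of sample i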
mask = torch.arange(max_length, device=length.device).unsqueeze(0).repeat(bs, 1)
mask = mask < length.unsqueeze(-1).repeat(1, max_length)
return (text, mask), label
def accuracy(output, target):
batch_size = target.size(0)
_, predicted = torch.max(output.data, 1)
return (predicted == target).sum().item() / batch_size
from nni.retiarii import basic_unit
import nni.retiarii.nn.pytorch as nn
import warnings
import torch
import torch.nn as torch_nn
try:
    from torchvision.models.utils import load_state_dict_from_url
except ImportError:  # newer torchvision removed models.utils; the helper lives in torch.hub
    from torch.hub import load_state_dict_from_url
import torch.nn.functional as F
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).resolve().parents[2]))
# Paper suggests 0.9997 momentum, for TensorFlow. Equivalent PyTorch momentum is
# 1.0 - tensorflow.
_BN_MOMENTUM = 1 - 0.9997
_FIRST_DEPTH = 32
_MOBILENET_V2_FILTERS = [16, 24, 32, 64, 96, 160, 320]
_MOBILENET_V2_NUM_LAYERS = [1, 2, 3, 4, 3, 3, 1]
class _ResidualBlock(nn.Module):
def __init__(self, net):
super().__init__()
self.net = net
def forward(self, x):
return self.net(x) + x
class _InvertedResidual(nn.Module):
def __init__(self, in_ch, out_ch, kernel_size, stride, expansion_factor, skip, bn_momentum=0.1):
super(_InvertedResidual, self).__init__()
assert stride in [1, 2]
assert kernel_size in [3, 5]
mid_ch = in_ch * expansion_factor
self.apply_residual = skip and in_ch == out_ch and stride == 1
self.layers = nn.Sequential(
# Pointwise
nn.Conv2d(in_ch, mid_ch, 1, bias=False),
nn.BatchNorm2d(mid_ch, momentum=bn_momentum),
nn.ReLU(inplace=True),
# Depthwise
nn.Conv2d(mid_ch, mid_ch, kernel_size, padding=kernel_size // 2,
stride=stride, groups=mid_ch, bias=False),
nn.BatchNorm2d(mid_ch, momentum=bn_momentum),
nn.ReLU(inplace=True),
# Linear pointwise. Note that there's no activation.
nn.Conv2d(mid_ch, out_ch, 1, bias=False),
nn.BatchNorm2d(out_ch, momentum=bn_momentum))
def forward(self, input):
if self.apply_residual:
ret = self.layers(input) + input
else:
ret = self.layers(input)
return ret
def _stack_inverted_residual(in_ch, out_ch, kernel_size, skip, stride, exp_factor, repeats, bn_momentum):
""" Creates a stack of inverted residuals. """
assert repeats >= 1
# First one has no skip, because feature map size changes.
first = _InvertedResidual(in_ch, out_ch, kernel_size, stride, exp_factor, skip, bn_momentum=bn_momentum)
remaining = []
for _ in range(1, repeats):
remaining.append(_InvertedResidual(out_ch, out_ch, kernel_size, 1, exp_factor, skip, bn_momentum=bn_momentum))
return nn.Sequential(first, *remaining)
def _stack_normal_conv(in_ch, out_ch, kernel_size, skip, dconv, stride, repeats, bn_momentum):
assert repeats >= 1
stack = []
for i in range(repeats):
s = stride if i == 0 else 1
if dconv:
modules = [
nn.Conv2d(in_ch, in_ch, kernel_size, padding=kernel_size // 2, stride=s, groups=in_ch, bias=False),
nn.BatchNorm2d(in_ch, momentum=bn_momentum),
nn.ReLU(inplace=True),
nn.Conv2d(in_ch, out_ch, 1, padding=0, stride=1, bias=False),
nn.BatchNorm2d(out_ch, momentum=bn_momentum)
]
else:
modules = [
nn.Conv2d(in_ch, out_ch, kernel_size, padding=kernel_size // 2, stride=s, bias=False),
nn.ReLU(inplace=True),
nn.BatchNorm2d(out_ch, momentum=bn_momentum)
]
if skip and in_ch == out_ch and s == 1:
# use different implementation for skip and noskip to align with pytorch
stack.append(_ResidualBlock(nn.Sequential(*modules)))
else:
stack += modules
in_ch = out_ch
return stack
def _round_to_multiple_of(val, divisor, round_up_bias=0.9):
""" Asymmetric rounding to make `val` divisible by `divisor`. With default
bias, will round up, unless the number is no more than 10% greater than the
smaller divisible value, i.e. (83, 8) -> 80, but (84, 8) -> 88. """
assert 0.0 < round_up_bias < 1.0
new_val = max(divisor, int(val + divisor / 2) // divisor * divisor)
return new_val if new_val >= round_up_bias * val else new_val + divisor
def _get_depths(depths, alpha):
""" Scales tensor depths as in reference MobileNet code, prefers rouding up
rather than down. """
return [_round_to_multiple_of(depth * alpha, 8) for depth in depths]
class MNASNet(nn.Module):
""" MNASNet, as described in https://arxiv.org/pdf/1807.11626.pdf. This
implements the B1 variant of the model.
>>> model = MNASNet(1000, 1.0)
>>> x = torch.rand(1, 3, 224, 224)
>>> y = model(x)
>>> y.dim()
2
>>> y.nelement()
1000
"""
# Version 2 adds depth scaling in the initial stages of the network.
_version = 2
def __init__(self, alpha, depths, convops, kernel_sizes, num_layers,
skips, num_classes=1000, dropout=0.2):
super().__init__()
assert alpha > 0.0
assert len(depths) == len(convops) == len(kernel_sizes) == len(num_layers) == len(skips) == 7
self.alpha = alpha
self.num_classes = num_classes
depths = _get_depths([_FIRST_DEPTH] + depths, alpha)
base_filter_sizes = [16, 24, 40, 80, 96, 192, 320]
exp_ratios = [3, 3, 3, 6, 6, 6, 6]
strides = [1, 2, 2, 2, 1, 2, 1]
layers = [
# First layer: regular conv.
nn.Conv2d(3, depths[0], 3, padding=1, stride=2, bias=False),
nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM),
nn.ReLU(inplace=True),
]
count = 0
# for conv, prev_depth, depth, ks, skip, stride, repeat, exp_ratio in \
# zip(convops, depths[:-1], depths[1:], kernel_sizes, skips, strides, num_layers, exp_ratios):
for filter_size, exp_ratio, stride in zip(base_filter_sizes, exp_ratios, strides):
# TODO: restrict that "choose" can only be used within mutator
ph = nn.Placeholder(label=f'mutable_{count}', **{
'kernel_size_options': [1, 3, 5],
'n_layer_options': [1, 2, 3, 4],
'op_type_options': ['__mutated__.base_mnasnet.RegularConv',
'__mutated__.base_mnasnet.DepthwiseConv',
'__mutated__.base_mnasnet.MobileConv'],
# 'se_ratio_options': [0, 0.25],
'skip_options': ['identity', 'no'],
'n_filter_options': [int(filter_size*x) for x in [0.75, 1.0, 1.25]],
'exp_ratio': exp_ratio,
'stride': stride,
'in_ch': depths[0] if count == 0 else None
})
layers.append(ph)
'''if conv == "mconv":
# MNASNet blocks: stacks of inverted residuals.
layers.append(_stack_inverted_residual(prev_depth, depth, ks, skip,
stride, exp_ratio, repeat, _BN_MOMENTUM))
else:
# Normal conv and depth-separated conv
layers += _stack_normal_conv(prev_depth, depth, ks, skip, conv == "dconv",
stride, repeat, _BN_MOMENTUM)'''
count += 1
if count >= 2:
break
layers += [
# Final mapping to classifier input.
nn.Conv2d(depths[7], 1280, 1, padding=0, stride=1, bias=False),
nn.BatchNorm2d(1280, momentum=_BN_MOMENTUM),
nn.ReLU(inplace=True),
]
self.layers = nn.Sequential(*layers)
self.classifier = nn.Sequential(nn.Dropout(p=dropout, inplace=True),
nn.Linear(1280, num_classes))
self._initialize_weights()
#self.for_test = 10
def forward(self, x):
# if self.for_test == 10:
x = self.layers(x)
# Equivalent to global avgpool and removing H and W dimensions.
x = x.mean([2, 3])
x = F.relu(x)
return self.classifier(x)
def _initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
torch_nn.init.kaiming_normal_(m.weight, mode="fan_out",
nonlinearity="relu")
if m.bias is not None:
torch_nn.init.zeros_(m.bias)
elif isinstance(m, nn.BatchNorm2d):
torch_nn.init.ones_(m.weight)
torch_nn.init.zeros_(m.bias)
elif isinstance(m, nn.Linear):
torch_nn.init.kaiming_uniform_(m.weight, mode="fan_out",
nonlinearity="sigmoid")
torch_nn.init.zeros_(m.bias)
def test_model(model):
model(torch.randn(2, 3, 224, 224))
# ====================definition of candidate op classes
BN_MOMENTUM = 1 - 0.9997
class RegularConv(nn.Module):
def __init__(self, kernel_size, in_ch, out_ch, skip, exp_ratio, stride):
super().__init__()
self.kernel_size = kernel_size
self.in_ch = in_ch
self.out_ch = out_ch
self.skip = skip
self.exp_ratio = exp_ratio
self.stride = stride
self.conv = nn.Conv2d(in_ch, out_ch, kernel_size, padding=kernel_size // 2, stride=stride, bias=False)
self.relu = nn.ReLU(inplace=True)
self.bn = nn.BatchNorm2d(out_ch, momentum=BN_MOMENTUM)
def forward(self, x):
out = self.bn(self.relu(self.conv(x)))
if self.skip == 'identity':
out = out + x
return out
class DepthwiseConv(nn.Module):
def __init__(self, kernel_size, in_ch, out_ch, skip, exp_ratio, stride):
super().__init__()
self.kernel_size = kernel_size
self.in_ch = in_ch
self.out_ch = out_ch
self.skip = skip
self.exp_ratio = exp_ratio
self.stride = stride
self.conv1 = nn.Conv2d(in_ch, in_ch, kernel_size, padding=kernel_size // 2, stride=stride, groups=in_ch, bias=False)
self.bn1 = nn.BatchNorm2d(in_ch, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(in_ch, out_ch, 1, padding=0, stride=1, bias=False)
self.bn2 = nn.BatchNorm2d(out_ch, momentum=BN_MOMENTUM)
def forward(self, x):
out = self.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
if self.skip == 'identity':
out = out + x
return out
class MobileConv(nn.Module):
def __init__(self, kernel_size, in_ch, out_ch, skip, exp_ratio, stride):
super().__init__()
self.kernel_size = kernel_size
self.in_ch = in_ch
self.out_ch = out_ch
self.skip = skip
self.exp_ratio = exp_ratio
self.stride = stride
mid_ch = in_ch * exp_ratio
self.layers = nn.Sequential(
# Pointwise
nn.Conv2d(in_ch, mid_ch, 1, bias=False),
nn.BatchNorm2d(mid_ch, momentum=BN_MOMENTUM),
nn.ReLU(inplace=True),
# Depthwise
nn.Conv2d(mid_ch, mid_ch, kernel_size, padding=(kernel_size - 1) // 2,
stride=stride, groups=mid_ch, bias=False),
nn.BatchNorm2d(mid_ch, momentum=BN_MOMENTUM),
nn.ReLU(inplace=True),
# Linear pointwise. Note that there's no activation.
nn.Conv2d(mid_ch, out_ch, 1, bias=False),
nn.BatchNorm2d(out_ch, momentum=BN_MOMENTUM))
def forward(self, x):
out = self.layers(x)
if self.skip == 'identity':
out = out + x
return out
# mnasnet0_5
ir_module = _InvertedResidual(16, 16, 3, 1, 1, True)
import logging
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).resolve().parents[2]))
from nni.retiarii import Mutator
from base_mnasnet import RegularConv, DepthwiseConv, MobileConv
_logger = logging.getLogger(__name__)
class BlockMutator(Mutator):
def __init__(self, target: str):
super(BlockMutator, self).__init__()
self.target = target
def mutate(self, model):
nodes = model.get_nodes_by_label(self.target)
assert len(nodes) == 1
node = nodes[0]
graph = node.graph
related_info = node.operation.parameters
kernel_size = self.choice(related_info['kernel_size_options'])
op_type = self.choice(related_info['op_type_options'])
#self.choice(related_info['se_ratio_options'])
skip = self.choice(related_info['skip_options'])
n_filter = self.choice(related_info['n_filter_options'])
if related_info['in_ch'] is not None:
in_ch = related_info['in_ch']
else:
assert len(node.predecessors) == 1
the_node = node.predecessors[0]
_logger.debug(repr(the_node.operation.parameters))
_logger.debug(the_node.__repr__())
in_ch = the_node.operation.parameters['out_ch']
# update the placeholder to be a new operation
node.update_operation(op_type, {
'kernel_size': kernel_size,
'in_ch': in_ch,
'out_ch': n_filter,
'skip': 'no',
'exp_ratio': related_info['exp_ratio'],
'stride': related_info['stride']
})
# insert new nodes after the placeholder
n_layer = self.choice(related_info['n_layer_options'])
for i in range(1, n_layer):
node = graph.insert_node_on_edge(node.outgoing_edges[0],
'{}_{}'.format(self.target, i),
op_type,
{'kernel_size': kernel_size,
'in_ch': n_filter,
'out_ch': n_filter,
'skip': skip,
'exp_ratio': related_info['exp_ratio'],
'stride': 1})
# fix possible shape mismatch
# TODO: use formal method function to update parameters
if len(node.successors) == 1 and 'in_channels' in node.successors[0].operation.parameters:
node.successors[0].operation.parameters['in_channels'] = n_filter
import os
import sys
import torch
from pathlib import Path
import nni.retiarii.evaluator.pytorch.lightning as pl
from nni.retiarii import serialize
from base_mnasnet import MNASNet
from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig
from nni.retiarii.strategy import TPEStrategy
from torchvision import transforms
from torchvision.datasets import CIFAR10
from mutator import BlockMutator
if __name__ == '__main__':
_DEFAULT_DEPTHS = [16, 24, 40, 80, 96, 192, 320]
_DEFAULT_CONVOPS = ["dconv", "mconv", "mconv", "mconv", "mconv", "mconv", "mconv"]
_DEFAULT_SKIPS = [False, True, True, True, True, True, True]
_DEFAULT_KERNEL_SIZES = [3, 3, 5, 5, 3, 5, 3]
_DEFAULT_NUM_LAYERS = [1, 3, 3, 3, 2, 4, 1]
base_model = MNASNet(0.5, _DEFAULT_DEPTHS, _DEFAULT_CONVOPS, _DEFAULT_KERNEL_SIZES,
_DEFAULT_NUM_LAYERS, _DEFAULT_SKIPS)
train_transform = transforms.Compose([
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
valid_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
train_dataset = serialize(CIFAR10, root='data/cifar10', train=True, download=True, transform=train_transform)
test_dataset = serialize(CIFAR10, root='data/cifar10', train=False, download=True, transform=valid_transform)
trainer = pl.Classification(train_dataloader=pl.DataLoader(train_dataset, batch_size=100),
val_dataloaders=pl.DataLoader(test_dataset, batch_size=100),
max_epochs=1, limit_train_batches=0.2)
applied_mutators = [
BlockMutator('mutable_0'),
BlockMutator('mutable_1')
]
simple_strategy = TPEStrategy()
exp = RetiariiExperiment(base_model, trainer, applied_mutators, simple_strategy)
exp_config = RetiariiExeConfig('local')
exp_config.experiment_name = 'mnasnet_search'
exp_config.trial_concurrency = 2
exp_config.max_trial_number = 10
exp_config.training_service.use_active_gpu = False
exp_config.execution_engine = 'base'
exp.run(exp_config, 8097)
import random
import nni
import torch
import torch.nn.functional as F
# remember to import nni.retiarii.nn.pytorch as nn, instead of torch.nn as nn
import nni.retiarii.nn.pytorch as nn
import nni.retiarii.strategy as strategy
from nni.retiarii import model_wrapper
from nni.retiarii.evaluator import FunctionalEvaluator
from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment, debug_mutated_model
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST
class DepthwiseSeparableConv(nn.Module):
def __init__(self, in_ch, out_ch):
super().__init__()
self.depthwise = nn.Conv2d(in_ch, in_ch, kernel_size=3, groups=in_ch)
self.pointwise = nn.Conv2d(in_ch, out_ch, kernel_size=1)
def forward(self, x):
return self.pointwise(self.depthwise(x))
@model_wrapper
class Net(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(1, 32, 3, 1)
# LayerChoice is used to select a layer between Conv2d and DwConv.
self.conv2 = nn.LayerChoice([
nn.Conv2d(32, 64, 3, 1),
DepthwiseSeparableConv(32, 64)
])
# ValueChoice is used to select a dropout rate.
# ValueChoice can be used as parameter of modules wrapped in `nni.retiarii.nn.pytorch`
# or customized modules wrapped with `@basic_unit`.
self.dropout1 = nn.Dropout(nn.ValueChoice([0.25, 0.5, 0.75]))
self.dropout2 = nn.Dropout(0.5)
feature = nn.ValueChoice([64, 128, 256])
# Same value choice can be used multiple times
self.fc1 = nn.Linear(9216, feature)
self.fc2 = nn.Linear(feature, 10)
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.max_pool2d(self.conv2(x), 2)
x = torch.flatten(self.dropout1(x), 1)
x = self.fc2(self.dropout2(F.relu(self.fc1(x))))
return x
def train_epoch(model, device, train_loader, optimizer, epoch):
loss_fn = torch.nn.CrossEntropyLoss()
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = loss_fn(output, target)
loss.backward()
optimizer.step()
if batch_idx % 10 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
def test_epoch(model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # accumulate the summed cross-entropy so the average loss below is meaningful
            test_loss += F.cross_entropy(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset), accuracy))
return accuracy
def evaluate_model(model_cls):
# "model_cls" is a class, need to instantiate
model = model_cls()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
transf = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_loader = DataLoader(MNIST('data/mnist', download=True, transform=transf), batch_size=64, shuffle=True)
test_loader = DataLoader(MNIST('data/mnist', download=True, train=False, transform=transf), batch_size=64)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # move the model to the selected device so it matches the data tensors below
    model.to(device)
for epoch in range(3):
# train the model for one epoch
train_epoch(model, device, train_loader, optimizer, epoch)
# test the model for one epoch
accuracy = test_epoch(model, device, test_loader)
# call report intermediate result. Result can be float or dict
nni.report_intermediate_result(accuracy)
# report final test result
nni.report_final_result(accuracy)
if __name__ == '__main__':
base_model = Net()
search_strategy = strategy.Random()
model_evaluator = FunctionalEvaluator(evaluate_model)
exp = RetiariiExperiment(base_model, model_evaluator, [], search_strategy)
exp_config = RetiariiExeConfig('local')
exp_config.experiment_name = 'mnist_search'
exp_config.trial_concurrency = 2
exp_config.max_trial_number = 20
exp_config.training_service.use_active_gpu = False
export_formatter = 'dict'
# uncomment this for graph-based execution engine
# exp_config.execution_engine = 'base'
# export_formatter = 'code'
exp.run(exp_config, 8081 + random.randint(0, 100))
print('Final model:')
for model_code in exp.export_top_models(formatter=export_formatter):
print(model_code)
import math
import torch.nn as nn
def truncated_normal_(tensor, mean=0, std=1):
# https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/15
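    # draw four candidates per element from N(0, 1), keep one that falls inside (-2, 2),
    # then scale by std and shift by mean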
size = tensor.shape
tmp = tensor.new_empty(size + (4,)).normal_()
valid = (tmp < 2) & (tmp > -2)
ind = valid.max(-1, keepdim=True)[1]
tensor.data.copy_(tmp.gather(-1, ind).squeeze(-1))
tensor.data.mul_(std).add_(mean)
class ConvBnRelu(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0):
super(ConvBnRelu, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.conv_bn_relu = nn.Sequential(
nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True)
)
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
fan_in = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
truncated_normal_(m.weight.data, mean=0., std=math.sqrt(1. / fan_in))
if isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def forward(self, x):
return self.conv_bn_relu(x)
class Conv3x3BnRelu(ConvBnRelu):
def __init__(self, in_channels, out_channels):
super(Conv3x3BnRelu, self).__init__(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
class Conv1x1BnRelu(ConvBnRelu):
def __init__(self, in_channels, out_channels):
super(Conv1x1BnRelu, self).__init__(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
Projection = Conv1x1BnRelu
import click
import nni
import nni.retiarii.evaluator.pytorch.lightning as pl
import torch.nn as nn
import torchmetrics
from nni.retiarii import model_wrapper, serialize
from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig
from nni.retiarii.nn.pytorch import NasBench101Cell
from nni.retiarii.strategy import Random
from pytorch_lightning.callbacks import LearningRateMonitor
from timm.optim import RMSpropTF
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchvision import transforms
from torchvision.datasets import CIFAR10
from base_ops import Conv3x3BnRelu, Conv1x1BnRelu, Projection
@model_wrapper
class NasBench101(nn.Module):
def __init__(self,
stem_out_channels: int = 128,
num_stacks: int = 3,
num_modules_per_stack: int = 3,
max_num_vertices: int = 7,
max_num_edges: int = 9,
num_labels: int = 10,
bn_eps: float = 1e-5,
bn_momentum: float = 0.003):
super().__init__()
op_candidates = {
'conv3x3-bn-relu': lambda num_features: Conv3x3BnRelu(num_features, num_features),
'conv1x1-bn-relu': lambda num_features: Conv1x1BnRelu(num_features, num_features),
'maxpool3x3': lambda num_features: nn.MaxPool2d(3, 1, 1)
}
# initial stem convolution
self.stem_conv = Conv3x3BnRelu(3, stem_out_channels)
layers = []
in_channels = out_channels = stem_out_channels
for stack_num in range(num_stacks):
if stack_num > 0:
downsample = nn.MaxPool2d(kernel_size=2, stride=2)
layers.append(downsample)
out_channels *= 2
for _ in range(num_modules_per_stack):
cell = NasBench101Cell(op_candidates, in_channels, out_channels,
lambda cin, cout: Projection(cin, cout),
max_num_vertices, max_num_edges, label='cell')
layers.append(cell)
in_channels = out_channels
self.features = nn.ModuleList(layers)
self.gap = nn.AdaptiveAvgPool2d(1)
self.classifier = nn.Linear(out_channels, num_labels)
for module in self.modules():
if isinstance(module, nn.BatchNorm2d):
module.eps = bn_eps
module.momentum = bn_momentum
def forward(self, x):
bs = x.size(0)
out = self.stem_conv(x)
for layer in self.features:
out = layer(out)
out = self.gap(out).view(bs, -1)
out = self.classifier(out)
return out
def reset_parameters(self):
for module in self.modules():
if isinstance(module, nn.BatchNorm2d):
module.eps = self.config.bn_eps
module.momentum = self.config.bn_momentum
class AccuracyWithLogits(torchmetrics.Accuracy):
def update(self, pred, target):
        return super().update(nn.functional.softmax(pred, dim=-1), target)
@nni.trace
class NasBench101TrainingModule(pl.LightningModule):
def __init__(self, max_epochs=108, learning_rate=0.1, weight_decay=1e-4):
super().__init__()
self.save_hyperparameters('learning_rate', 'weight_decay', 'max_epochs')
self.criterion = nn.CrossEntropyLoss()
self.accuracy = AccuracyWithLogits()
def forward(self, x):
y_hat = self.model(x)
return y_hat
def training_step(self, batch, batch_idx):
x, y = batch
y_hat = self(x)
loss = self.criterion(y_hat, y)
self.log('train_loss', loss, prog_bar=True)
self.log('train_accuracy', self.accuracy(y_hat, y), prog_bar=True)
return loss
def validation_step(self, batch, batch_idx):
x, y = batch
y_hat = self(x)
self.log('val_loss', self.criterion(y_hat, y), prog_bar=True)
self.log('val_accuracy', self.accuracy(y_hat, y), prog_bar=True)
def configure_optimizers(self):
optimizer = RMSpropTF(self.parameters(), lr=self.hparams.learning_rate,
weight_decay=self.hparams.weight_decay,
momentum=0.9, alpha=0.9, eps=1.0)
return {
'optimizer': optimizer,
            'lr_scheduler': CosineAnnealingLR(optimizer, self.hparams.max_epochs)
}
def on_validation_epoch_end(self):
nni.report_intermediate_result(self.trainer.callback_metrics['val_accuracy'].item())
def teardown(self, stage):
if stage == 'fit':
nni.report_final_result(self.trainer.callback_metrics['val_accuracy'].item())
@click.command()
@click.option('--epochs', default=108, help='Training length.')
@click.option('--batch_size', default=256, help='Batch size.')
@click.option('--port', default=8081, help='On which port the experiment is run.')
@click.option('--benchmark', is_flag=True, default=False)
def _multi_trial_test(epochs, batch_size, port, benchmark):
    # initialize dataset. Note that 50k training + 10k test images are used, which differs slightly from the paper
transf = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
normalize = [
transforms.ToTensor(),
transforms.Normalize([0.49139968, 0.48215827, 0.44653124], [0.24703233, 0.24348505, 0.26158768])
]
train_dataset = serialize(CIFAR10, 'data', train=True, download=True, transform=transforms.Compose(transf + normalize))
test_dataset = serialize(CIFAR10, 'data', train=False, transform=transforms.Compose(normalize))
# specify training hyper-parameters
training_module = NasBench101TrainingModule(max_epochs=epochs)
# FIXME: need to fix a bug in serializer for this to work
# lr_monitor = serialize(LearningRateMonitor, logging_interval='step')
trainer = pl.Trainer(max_epochs=epochs, gpus=1)
lightning = pl.Lightning(
lightning_module=training_module,
trainer=trainer,
train_dataloader=pl.DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
val_dataloaders=pl.DataLoader(test_dataset, batch_size=batch_size),
)
strategy = Random()
model = NasBench101()
exp = RetiariiExperiment(model, lightning, [], strategy)
exp_config = RetiariiExeConfig('local')
exp_config.trial_concurrency = 2
exp_config.max_trial_number = 20
exp_config.trial_gpu_number = 1
exp_config.training_service.use_active_gpu = False
if benchmark:
exp_config.benchmark = 'nasbench101'
exp_config.execution_engine = 'benchmark'
exp.run(exp_config, port)
if __name__ == '__main__':
_multi_trial_test()
import torch
import torch.nn as nn
OPS_WITH_STRIDE = {
'none': lambda C_in, C_out, stride: Zero(C_in, C_out, stride),
'avg_pool_3x3': lambda C_in, C_out, stride: Pooling(C_in, C_out, stride, 'avg'),
'max_pool_3x3': lambda C_in, C_out, stride: Pooling(C_in, C_out, stride, 'max'),
'conv_3x3': lambda C_in, C_out, stride: ReLUConvBN(C_in, C_out, (3, 3), (stride, stride), (1, 1), (1, 1)),
'conv_1x1': lambda C_in, C_out, stride: ReLUConvBN(C_in, C_out, (1, 1), (stride, stride), (0, 0), (1, 1)),
'skip_connect': lambda C_in, C_out, stride: nn.Identity() if stride == 1 and C_in == C_out
else FactorizedReduce(C_in, C_out, stride),
}
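# the five candidate operations of the NAS-Bench-201 cell search space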
PRIMITIVES = ['none', 'skip_connect', 'conv_1x1', 'conv_3x3', 'avg_pool_3x3']
class ReLUConvBN(nn.Module):
def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation):
super(ReLUConvBN, self).__init__()
self.op = nn.Sequential(
nn.ReLU(inplace=False),
nn.Conv2d(C_in, C_out, kernel_size, stride=stride,
padding=padding, dilation=dilation, bias=False),
nn.BatchNorm2d(C_out)
)
def forward(self, x):
return self.op(x)
class SepConv(nn.Module):
def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation):
super(SepConv, self).__init__()
self.op = nn.Sequential(
nn.ReLU(inplace=False),
nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride,
padding=padding, dilation=dilation, groups=C_in, bias=False),
nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
nn.BatchNorm2d(C_out),
)
def forward(self, x):
return self.op(x)
class Pooling(nn.Module):
def __init__(self, C_in, C_out, stride, mode):
super(Pooling, self).__init__()
if C_in == C_out:
self.preprocess = None
else:
self.preprocess = ReLUConvBN(C_in, C_out, 1, 1, 0, 1)
if mode == 'avg':
self.op = nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False)
elif mode == 'max':
self.op = nn.MaxPool2d(3, stride=stride, padding=1)
else:
raise ValueError('Invalid mode={:} in Pooling'.format(mode))
def forward(self, x):
if self.preprocess:
x = self.preprocess(x)
return self.op(x)
class Zero(nn.Module):
def __init__(self, C_in, C_out, stride):
super(Zero, self).__init__()
self.C_in = C_in
self.C_out = C_out
self.stride = stride
self.is_zero = True
def forward(self, x):
if self.C_in == self.C_out:
if self.stride == 1:
return x.mul(0.)
else:
return x[:, :, ::self.stride, ::self.stride].mul(0.)
else:
shape = list(x.shape)
shape[1] = self.C_out
zeros = x.new_zeros(shape, dtype=x.dtype, device=x.device)
return zeros
class FactorizedReduce(nn.Module):
def __init__(self, C_in, C_out, stride):
super(FactorizedReduce, self).__init__()
self.stride = stride
self.C_in = C_in
self.C_out = C_out
self.relu = nn.ReLU(inplace=False)
if stride == 2:
C_outs = [C_out // 2, C_out - C_out // 2]
self.convs = nn.ModuleList()
for i in range(2):
self.convs.append(nn.Conv2d(C_in, C_outs[i], 1, stride=stride, padding=0, bias=False))
self.pad = nn.ConstantPad2d((0, 1, 0, 1), 0)
else:
raise ValueError('Invalid stride : {:}'.format(stride))
self.bn = nn.BatchNorm2d(C_out)
def forward(self, x):
x = self.relu(x)
y = self.pad(x)
out = torch.cat([self.convs[0](x), self.convs[1](y[:, :, 1:, 1:])], dim=1)
out = self.bn(out)
return out
class ResNetBasicblock(nn.Module):
def __init__(self, inplanes, planes, stride):
super(ResNetBasicblock, self).__init__()
assert stride == 1 or stride == 2, 'invalid stride {:}'.format(stride)
self.conv_a = ReLUConvBN(inplanes, planes, 3, stride, 1, 1)
self.conv_b = ReLUConvBN(planes, planes, 3, 1, 1, 1)
if stride == 2:
self.downsample = nn.Sequential(
nn.AvgPool2d(kernel_size=2, stride=2, padding=0),
nn.Conv2d(inplanes, planes, kernel_size=1, stride=1, padding=0, bias=False))
elif inplanes != planes:
self.downsample = ReLUConvBN(inplanes, planes, 1, 1, 0, 1)
else:
self.downsample = None
self.in_dim = inplanes
self.out_dim = planes
self.stride = stride
self.num_conv = 2
def forward(self, inputs):
basicblock = self.conv_a(inputs)
basicblock = self.conv_b(basicblock)
if self.downsample is not None:
inputs = self.downsample(inputs) # residual
return inputs + basicblock
import click
import nni
import nni.retiarii.evaluator.pytorch.lightning as pl
import torch.nn as nn
import torchmetrics
from nni.retiarii import model_wrapper, serialize
from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig
from nni.retiarii.nn.pytorch import NasBench201Cell
from nni.retiarii.strategy import Random
from pytorch_lightning.callbacks import LearningRateMonitor
from timm.optim import RMSpropTF
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchvision import transforms
from torchvision.datasets import CIFAR100
from base_ops import ResNetBasicblock, PRIMITIVES, OPS_WITH_STRIDE
@model_wrapper
class NasBench201(nn.Module):
def __init__(self,
stem_out_channels: int = 16,
num_modules_per_stack: int = 5,
num_labels: int = 100):
super().__init__()
self.channels = C = stem_out_channels
self.num_modules = N = num_modules_per_stack
self.num_labels = num_labels
self.stem = nn.Sequential(
nn.Conv2d(3, C, kernel_size=3, padding=1, bias=False),
nn.BatchNorm2d(C)
)
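        # macro skeleton: three stacks of N searchable cells separated by two
        # ResNet basic blocks that halve the resolution and double the channels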
layer_channels = [C] * N + [C * 2] + [C * 2] * N + [C * 4] + [C * 4] * N
layer_reductions = [False] * N + [True] + [False] * N + [True] + [False] * N
C_prev = C
self.cells = nn.ModuleList()
for C_curr, reduction in zip(layer_channels, layer_reductions):
if reduction:
cell = ResNetBasicblock(C_prev, C_curr, 2)
else:
                # bind `prim` as a default argument; otherwise every lambda would capture the same
                # loop variable and resolve to the last primitive after the comprehension finishes
                cell = NasBench201Cell({prim: (lambda C_in, C_out, prim=prim: OPS_WITH_STRIDE[prim](C_in, C_out, 1))
                                        for prim in PRIMITIVES},
                                       C_prev, C_curr, label='cell')
self.cells.append(cell)
C_prev = C_curr
self.lastact = nn.Sequential(
nn.BatchNorm2d(C_prev),
nn.ReLU(inplace=True)
)
self.global_pooling = nn.AdaptiveAvgPool2d(1)
self.classifier = nn.Linear(C_prev, self.num_labels)
def forward(self, inputs):
feature = self.stem(inputs)
for cell in self.cells:
feature = cell(feature)
out = self.lastact(feature)
out = self.global_pooling(out)
out = out.view(out.size(0), -1)
logits = self.classifier(out)
return logits
class AccuracyWithLogits(torchmetrics.Accuracy):
def update(self, pred, target):
        return super().update(nn.functional.softmax(pred, dim=-1), target)
@nni.trace
class NasBench201TrainingModule(pl.LightningModule):
def __init__(self, max_epochs=200, learning_rate=0.1, weight_decay=5e-4):
super().__init__()
self.save_hyperparameters('learning_rate', 'weight_decay', 'max_epochs')
self.criterion = nn.CrossEntropyLoss()
self.accuracy = AccuracyWithLogits()
def forward(self, x):
y_hat = self.model(x)
return y_hat
def training_step(self, batch, batch_idx):
x, y = batch
y_hat = self(x)
loss = self.criterion(y_hat, y)
self.log('train_loss', loss, prog_bar=True)
self.log('train_accuracy', self.accuracy(y_hat, y), prog_bar=True)
return loss
def validation_step(self, batch, batch_idx):
x, y = batch
y_hat = self(x)
self.log('val_loss', self.criterion(y_hat, y), prog_bar=True)
self.log('val_accuracy', self.accuracy(y_hat, y), prog_bar=True)
def configure_optimizers(self):
optimizer = RMSpropTF(self.parameters(), lr=self.hparams.learning_rate,
weight_decay=self.hparams.weight_decay,
momentum=0.9, alpha=0.9, eps=1.0)
return {
'optimizer': optimizer,
            'lr_scheduler': CosineAnnealingLR(optimizer, self.hparams.max_epochs)
}
def on_validation_epoch_end(self):
nni.report_intermediate_result(self.trainer.callback_metrics['val_accuracy'].item())
def teardown(self, stage):
if stage == 'fit':
nni.report_final_result(self.trainer.callback_metrics['val_accuracy'].item())
@click.command()
@click.option('--epochs', default=12, help='Training length.')
@click.option('--batch_size', default=256, help='Batch size.')
@click.option('--port', default=8081, help='On which port the experiment is run.')
@click.option('--benchmark', is_flag=True, default=False)
def _multi_trial_test(epochs, batch_size, port, benchmark):
    # initialize dataset. Note that 50k training + 10k test images are used, which differs slightly from the paper
transf = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
normalize = [
transforms.ToTensor(),
transforms.Normalize([x / 255 for x in [129.3, 124.1, 112.4]], [x / 255 for x in [68.2, 65.4, 70.4]])
]
train_dataset = serialize(CIFAR100, 'data', train=True, download=True, transform=transforms.Compose(transf + normalize))
test_dataset = serialize(CIFAR100, 'data', train=False, transform=transforms.Compose(normalize))
# specify training hyper-parameters
training_module = NasBench201TrainingModule(max_epochs=epochs)
# FIXME: need to fix a bug in serializer for this to work
# lr_monitor = serialize(LearningRateMonitor, logging_interval='step')
trainer = pl.Trainer(max_epochs=epochs, gpus=1)
lightning = pl.Lightning(
lightning_module=training_module,
trainer=trainer,
train_dataloader=pl.DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
val_dataloaders=pl.DataLoader(test_dataset, batch_size=batch_size),
)
strategy = Random()
model = NasBench201()
exp = RetiariiExperiment(model, lightning, [], strategy)
exp_config = RetiariiExeConfig('local')
exp_config.trial_concurrency = 2
exp_config.max_trial_number = 20
exp_config.trial_gpu_number = 1
exp_config.training_service.use_active_gpu = False
if benchmark:
exp_config.benchmark = 'nasbench201-cifar100'
exp_config.execution_engine = 'benchmark'
exp.run(exp_config, port)
if __name__ == '__main__':
_multi_trial_test()
# Tuning Transformer with Retiarii
This demo is adapted from the PyTorch Transformer tutorial.
Here we show how to use the functions provided by Retiarii to tune Transformer's hyper-parameters in order to achieve better performance.
This demo is tested with PyTorch 1.9, torchtext == 0.10, and nni == 2.4.
Please change the configurations (starting on line 196) accordingly and then run: `python retiarii_transformer_demo.py`
We use WikiText-2, a built-in dataset provided by torchtext, to evaluate the Transformer on language modeling. We tune two hyper-parameters: the number of encoder layers (`n_layer`), whose default value in the original paper is 6, and the dropout rate shared by all encoder layers (`p_dropout`), whose default value is 0.1. We report validation perplexity as the metric (lower is better).
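For reference, here is a minimal sketch (mirroring the demo script included below) of how the two hyper-parameters are declared as mutables with `nn.ValueChoice`:

```python
# Sketch only: the full model definition lives in the demo script below.
import nni.retiarii.nn.pytorch as nn  # Retiarii's nn module, not torch.nn

p_dropout = nn.ValueChoice([0.1, 0.2, 0.3, 0.4, 0.5], label='p_dropout')
n_layer = nn.ValueChoice([5, 6, 7, 8, 9], label='n_layer')
encoder = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=512, nhead=8, dim_feedforward=2048, dropout=p_dropout),
    n_layer)
```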
We first tune each hyper-parameter separately, with the other fixed to its default value. The results are:
![separate](https://user-images.githubusercontent.com/22978940/136937420-80aecee9-43cc-4f8d-b282-18aec0ad3929.png)
And then we tune these two hyper-parameters jointly. The results are:
<p align="center">
<img src="https://user-images.githubusercontent.com/22978940/136937807-342fde98-6498-4cdd-abdd-4633fd15b7dc.png" width="700">
</p>
As the plots show, the search finds better hyper-parameters (`n_layer = 8`, `p_dropout = 0.2`) than the default values.
###############################################################
# This demo is adapted from PyTorch Transformer tutorial <https://pytorch.org/tutorials/beginner/transformer_tutorial.html>
# Here we show how we use functions provided by retiarii to tune Transformer's hyper-parameters,
# in order to achieve better performance.
# This demo is tested with PyTorch 1.9, torchtext == 0.10, and nni == 2.4
import torch
import torch.nn.functional as F
import nni.retiarii.nn.pytorch as nn
from nni.retiarii import model_wrapper
import nni
import nni.retiarii.strategy as strategy
from nni.retiarii.evaluator import FunctionalEvaluator
from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig
import math
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
class PositionalEncoding(nn.Module):
def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
super().__init__()
self.dropout = nn.Dropout(p=dropout)
position = torch.arange(max_len).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
pe = torch.zeros(max_len, 1, d_model)
pe[:, 0, 0::2] = torch.sin(position * div_term)
pe[:, 0, 1::2] = torch.cos(position * div_term)
self.register_buffer('pe', pe)
def forward(self, x):
"""
Args:
x: Tensor, with size [seq_len, batch_size, embedding_dim]
"""
x = x + self.pe[:x.size(0)]
return self.dropout(x)
###############################################################
# PyTorch has already provided modules for Transformer: nn.TransformerEncoderLayer and nn.TransformerEncoder,
# so we can use them directly, but note that to enable retiarii functions, we need to replace "import torch.nn as nn"
# with "import nni.retiarii.nn.pytorch as nn".
#
# We use nn.ValueChoice to make the number of encoder layers (the default is 6) and the dropout rate mutable.
# For other hyper-parameters, we follow the setting in the original paper "Attention is All You Need".
@model_wrapper # This decorator should be put on the top level module.
class Transformer(nn.Module):
def __init__(self, n_token: int, n_head: int = 8,
d_model: int = 512, d_ff: int = 2048):
super().__init__()
p_dropout = nn.ValueChoice([0.1, 0.2, 0.3, 0.4, 0.5], label='p_dropout')
n_layer = nn.ValueChoice([5, 6, 7, 8, 9], label='n_layer')
self.encoder = nn.TransformerEncoder(
nn.TransformerEncoderLayer(d_model, n_head, d_ff, p_dropout),
n_layer
)
self.d_model = d_model
self.decoder = nn.Linear(d_model, n_token)
self.embeddings = nn.Embedding(n_token, d_model)
self.position = PositionalEncoding(d_model)
def forward(self, src, src_mask):
"""
Args:
src: Tensor, with size [seq_len, batch_size]
src_mask: Tensor, with size [seq_len, seq_len]
Returns:
output: Tensor, with size [seq_len, batch_size, n_token]
"""
src = self.embeddings(src) * math.sqrt(self.d_model)
src = self.position(src)
output = self.encoder(src, src_mask)
output = self.decoder(output)
return output
###############################################################
# We wrap the whole training procedure in the fit function.
# This function takes one positional argument, model_cls, which represents one exploration (i.e., one trial).
# model_cls is automatically generated and passed in by Retiarii; we instantiate it
# via model = model_cls().
def fit(model_cls):
train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])
def process_data(raw_text_iter):
"""Converts raw text into a flat Tensor."""
data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]
return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
train_iter, val_iter, _ = WikiText2()
train_data = process_data(train_iter)
val_data = process_data(val_iter)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def generate_batches(data, bsz):
"""Divides the data into bsz separate sequences."""
seq_len = data.size(0) // bsz
data = data[:seq_len * bsz]
data = data.view(bsz, seq_len).t().contiguous()
return data.to(device)
batch_size = 20
eval_batch_size = 10
train_data = generate_batches(train_data, batch_size)
val_data = generate_batches(val_data, eval_batch_size)
seq_len = 35
def get_seq(source, i):
"""
Args:
source: Tensor, with size [full_seq_len, batch_size]
i: int
Returns:
tuple (data, target): data has size [seq_len, batch_size]
and target has size [seq_len * batch_size]
"""
part_len = min(seq_len, len(source) - 1 - i)
data = source[i:i+part_len]
target = source[i+1:i+1+part_len].reshape(-1)
return data, target
def generate_square_subsequent_mask(sz):
"""Generates an upper-triangular matrix of -inf, with zeros on diag."""
return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
model = model_cls().to(device)
lr = 5.0
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
def train(model):
model.train()
src_mask = generate_square_subsequent_mask(seq_len).to(device)
for i in range(0, train_data.size(0) - 1, seq_len):
data, target = get_seq(train_data, i)
part_len = data.size(0)
if part_len != seq_len:
src_mask = src_mask[:part_len, :part_len]
output = model(data, src_mask)
loss = F.cross_entropy(output.view(-1, output.size(-1)), target)
optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
optimizer.step()
def evaluate(model, eval_data):
model.eval()
src_mask = generate_square_subsequent_mask(seq_len).to(device)
total_loss = 0.
with torch.no_grad():
for i in range(0, eval_data.size(0) - 1, seq_len):
data, target = get_seq(eval_data, i)
part_len = data.size(0)
if part_len != seq_len:
src_mask = src_mask[:part_len, :part_len]
output = model(data, src_mask)
output_flat = output.view(-1, output.size(-1))
total_loss += part_len * F.cross_entropy(output_flat, target).item()
return total_loss / (len(eval_data) - 1)
best_val_loss = float('inf')
for epoch in range(20):
train(model)
val_loss = evaluate(model, val_data)
if val_loss < best_val_loss:
best_val_loss = val_loss
scheduler.step()
best_val_ppl = math.exp(best_val_loss)
nni.report_final_result(best_val_ppl) # reports best validation ppl to nni as final result of one trial
if __name__ == "__main__":
train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])
n_token = len(vocab)
base_model = Transformer(n_token)
evaluator = FunctionalEvaluator(fit)
exp = RetiariiExperiment(base_model, evaluator, [], strategy.Random())
exp_config = RetiariiExeConfig('local')
exp_config.experiment_name = 'transformer tuning'
exp_config.trial_concurrency = 3 # please change configurations accordingly
exp_config.max_trial_number = 25
exp_config.trial_gpu_number = 1
exp_config.training_service.use_active_gpu = False
export_formatter = 'dict'
exp.run(exp_config, 8081)
print('Final model:')
for model_code in exp.export_top_models(optimize_mode='minimize', formatter=export_formatter):
print(model_code)
[Documentation](https://nni.readthedocs.io/en/latest/NAS/DARTS.html)
[Documentation (Chinese)](https://nni.readthedocs.io/zh/latest/NAS/DARTS.html)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import numpy as np
import torch
from torchvision import transforms
from torchvision.datasets import CIFAR10
class Cutout(object):
def __init__(self, length):
self.length = length
def __call__(self, img):
h, w = img.size(1), img.size(2)
mask = np.ones((h, w), np.float32)
y = np.random.randint(h)
x = np.random.randint(w)
y1 = np.clip(y - self.length // 2, 0, h)
y2 = np.clip(y + self.length // 2, 0, h)
x1 = np.clip(x - self.length // 2, 0, w)
x2 = np.clip(x + self.length // 2, 0, w)
mask[y1: y2, x1: x2] = 0.
mask = torch.from_numpy(mask)
mask = mask.expand_as(img)
img *= mask
return img
def get_dataset(cls, cutout_length=0):
MEAN = [0.49139968, 0.48215827, 0.44653124]
STD = [0.24703233, 0.24348505, 0.26158768]
transf = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
normalize = [
transforms.ToTensor(),
transforms.Normalize(MEAN, STD)
]
cutout = []
if cutout_length > 0:
cutout.append(Cutout(cutout_length))
train_transform = transforms.Compose(transf + normalize + cutout)
valid_transform = transforms.Compose(normalize)
if cls == "cifar10":
dataset_train = CIFAR10(root="./data", train=True, download=True, transform=train_transform)
dataset_valid = CIFAR10(root="./data", train=False, download=True, transform=valid_transform)
else:
raise NotImplementedError
return dataset_train, dataset_valid
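# A minimal usage sketch (assumption: run standalone as a sanity check; the DARTS retrain script
# further below consumes this module the same way via datasets.get_dataset("cifar10", cutout_length=16)).
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    # Build CIFAR-10 train/valid sets; Cutout(16) is applied to training images only.
    dataset_train, dataset_valid = get_dataset("cifar10", cutout_length=16)
    loader = DataLoader(dataset_train, batch_size=4, shuffle=True, num_workers=0)
    images, labels = next(iter(loader))
    print(images.shape, labels.shape)  # expected: torch.Size([4, 3, 32, 32]) torch.Size([4])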
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from collections import OrderedDict
import torch
import torch.nn as nn
import ops
from nni.retiarii.nn.pytorch import LayerChoice, InputChoice
class AuxiliaryHead(nn.Module):
""" Auxiliary head in 2/3 place of network to let the gradient flow well """
def __init__(self, input_size, C, n_classes):
""" assuming input size 7x7 or 8x8 """
assert input_size in [7, 8]
super().__init__()
self.net = nn.Sequential(
nn.ReLU(inplace=True),
nn.AvgPool2d(5, stride=input_size - 5, padding=0, count_include_pad=False), # 2x2 out
nn.Conv2d(C, 128, kernel_size=1, bias=False),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.Conv2d(128, 768, kernel_size=2, bias=False), # 1x1 out
nn.BatchNorm2d(768),
nn.ReLU(inplace=True)
)
self.linear = nn.Linear(768, n_classes)
def forward(self, x):
out = self.net(x)
out = out.view(out.size(0), -1) # flatten
logits = self.linear(out)
return logits
class Node(nn.Module):
def __init__(self, node_id, num_prev_nodes, channels, num_downsample_connect):
super().__init__()
self.ops = nn.ModuleList()
choice_keys = []
for i in range(num_prev_nodes):
stride = 2 if i < num_downsample_connect else 1
choice_keys.append("{}_p{}".format(node_id, i))
self.ops.append(
LayerChoice(OrderedDict([
("maxpool", ops.PoolBN('max', channels, 3, stride, 1, affine=False)),
("avgpool", ops.PoolBN('avg', channels, 3, stride, 1, affine=False)),
("skipconnect", nn.Identity() if stride == 1 else ops.FactorizedReduce(channels, channels, affine=False)),
("sepconv3x3", ops.SepConv(channels, channels, 3, stride, 1, affine=False)),
("sepconv5x5", ops.SepConv(channels, channels, 5, stride, 2, affine=False)),
("dilconv3x3", ops.DilConv(channels, channels, 3, stride, 2, 2, affine=False)),
("dilconv5x5", ops.DilConv(channels, channels, 5, stride, 4, 2, affine=False))
]), label=choice_keys[-1]))
self.drop_path = ops.DropPath()
self.input_switch = InputChoice(n_candidates=len(choice_keys), n_chosen=2, label="{}_switch".format(node_id))
def forward(self, prev_nodes):
assert len(self.ops) == len(prev_nodes)
out = [op(node) for op, node in zip(self.ops, prev_nodes)]
out = [self.drop_path(o) if o is not None else None for o in out]
return self.input_switch(out)
class Cell(nn.Module):
def __init__(self, n_nodes, channels_pp, channels_p, channels, reduction_p, reduction):
super().__init__()
self.reduction = reduction
self.n_nodes = n_nodes
        # If the previous cell is a reduction cell, the current input size does not match the
        # output size of cell[k-2], so output[k-2] should be downsampled by preprocessing.
if reduction_p:
self.preproc0 = ops.FactorizedReduce(channels_pp, channels, affine=False)
else:
self.preproc0 = ops.StdConv(channels_pp, channels, 1, 1, 0, affine=False)
self.preproc1 = ops.StdConv(channels_p, channels, 1, 1, 0, affine=False)
# generate dag
self.mutable_ops = nn.ModuleList()
for depth in range(2, self.n_nodes + 2):
self.mutable_ops.append(Node("{}_n{}".format("reduce" if reduction else "normal", depth),
depth, channels, 2 if reduction else 0))
def forward(self, s0, s1):
        # s0, s1 are the outputs of the second-previous cell and the previous cell, respectively.
tensors = [self.preproc0(s0), self.preproc1(s1)]
for node in self.mutable_ops:
cur_tensor = node(tensors)
tensors.append(cur_tensor)
output = torch.cat(tensors[2:], dim=1)
return output
class CNN(nn.Module):
def __init__(self, input_size, in_channels, channels, n_classes, n_layers, n_nodes=4,
stem_multiplier=3, auxiliary=False):
super().__init__()
self.in_channels = in_channels
self.channels = channels
self.n_classes = n_classes
self.n_layers = n_layers
self.aux_pos = 2 * n_layers // 3 if auxiliary else -1
c_cur = stem_multiplier * self.channels
self.stem = nn.Sequential(
nn.Conv2d(in_channels, c_cur, 3, 1, 1, bias=False),
nn.BatchNorm2d(c_cur)
)
# for the first cell, stem is used for both s0 and s1
        # [!] channels_pp and channels_p are output channel sizes, while c_cur is the input channel size.
channels_pp, channels_p, c_cur = c_cur, c_cur, channels
self.cells = nn.ModuleList()
reduction_p, reduction = False, False
for i in range(n_layers):
reduction_p, reduction = reduction, False
            # Reduce feature map size and double channels at 1/3 and 2/3 of the layers.
if i in [n_layers // 3, 2 * n_layers // 3]:
c_cur *= 2
reduction = True
cell = Cell(n_nodes, channels_pp, channels_p, c_cur, reduction_p, reduction)
self.cells.append(cell)
c_cur_out = c_cur * n_nodes
channels_pp, channels_p = channels_p, c_cur_out
if i == self.aux_pos:
self.aux_head = AuxiliaryHead(input_size // 4, channels_p, n_classes)
self.gap = nn.AdaptiveAvgPool2d(1)
self.linear = nn.Linear(channels_p, n_classes)
def forward(self, x):
s0 = s1 = self.stem(x)
aux_logits = None
for i, cell in enumerate(self.cells):
s0, s1 = s1, cell(s0, s1)
if i == self.aux_pos and self.training:
aux_logits = self.aux_head(s1)
out = self.gap(s1)
out = out.view(out.size(0), -1) # flatten
logits = self.linear(out)
if aux_logits is not None:
return logits, aux_logits
return logits
def drop_path_prob(self, p):
for module in self.modules():
if isinstance(module, ops.DropPath):
module.p = p
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch
import torch.nn as nn
class DropPath(nn.Module):
def __init__(self, p=0.):
"""
Drop path with probability.
Parameters
----------
p : float
            Probability of a path being zeroed.
"""
super().__init__()
self.p = p
def forward(self, x):
if self.training and self.p > 0.:
keep_prob = 1. - self.p
# per data point mask
mask = torch.zeros((x.size(0), 1, 1, 1), device=x.device).bernoulli_(keep_prob)
return x / keep_prob * mask
return x
class PoolBN(nn.Module):
"""
AvgPool or MaxPool with BN. `pool_type` must be `max` or `avg`.
"""
def __init__(self, pool_type, C, kernel_size, stride, padding, affine=True):
super().__init__()
if pool_type.lower() == 'max':
self.pool = nn.MaxPool2d(kernel_size, stride, padding)
elif pool_type.lower() == 'avg':
self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False)
else:
            raise ValueError("pool_type must be 'max' or 'avg'")
self.bn = nn.BatchNorm2d(C, affine=affine)
def forward(self, x):
out = self.pool(x)
out = self.bn(out)
return out
class StdConv(nn.Module):
"""
Standard conv: ReLU - Conv - BN
"""
def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
super().__init__()
self.net = nn.Sequential(
nn.ReLU(),
nn.Conv2d(C_in, C_out, kernel_size, stride, padding, bias=False),
nn.BatchNorm2d(C_out, affine=affine)
)
def forward(self, x):
return self.net(x)
class FacConv(nn.Module):
"""
Factorized conv: ReLU - Conv(Kx1) - Conv(1xK) - BN
"""
def __init__(self, C_in, C_out, kernel_length, stride, padding, affine=True):
super().__init__()
self.net = nn.Sequential(
nn.ReLU(),
nn.Conv2d(C_in, C_in, (kernel_length, 1), stride, padding, bias=False),
nn.Conv2d(C_in, C_out, (1, kernel_length), stride, padding, bias=False),
nn.BatchNorm2d(C_out, affine=affine)
)
def forward(self, x):
return self.net(x)
class DilConv(nn.Module):
"""
(Dilated) depthwise separable conv.
ReLU - (Dilated) depthwise separable - Pointwise - BN.
If dilation == 2, 3x3 conv => 5x5 receptive field, 5x5 conv => 9x9 receptive field.
"""
def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True):
super().__init__()
self.net = nn.Sequential(
nn.ReLU(),
nn.Conv2d(C_in, C_in, kernel_size, stride, padding, dilation=dilation, groups=C_in,
bias=False),
nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False),
nn.BatchNorm2d(C_out, affine=affine)
)
def forward(self, x):
return self.net(x)
class SepConv(nn.Module):
"""
Depthwise separable conv.
DilConv(dilation=1) * 2.
"""
def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
super().__init__()
self.net = nn.Sequential(
DilConv(C_in, C_in, kernel_size, stride, padding, dilation=1, affine=affine),
DilConv(C_in, C_out, kernel_size, 1, padding, dilation=1, affine=affine)
)
def forward(self, x):
return self.net(x)
class FactorizedReduce(nn.Module):
"""
    Reduce feature map size by factorized pointwise convolution (stride=2).
"""
def __init__(self, C_in, C_out, affine=True):
super().__init__()
self.relu = nn.ReLU()
self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
self.bn = nn.BatchNorm2d(C_out, affine=affine)
def forward(self, x):
x = self.relu(x)
out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1)
out = self.bn(out)
return out
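# An illustrative sketch (assumption: run standalone) of DropPath behaviour: in training mode each
# sample's path is zeroed with probability p and the survivors are rescaled by 1 / (1 - p); in eval
# mode the input passes through unchanged.
if __name__ == "__main__":
    x = torch.ones(8, 3, 4, 4)
    drop = DropPath(p=0.5)
    drop.train()
    y = drop(x)  # per sample: either all zeros or all values scaled to 2.0
    drop.eval()
    z = drop(x)  # identity in eval mode
    print(y[:, 0, 0, 0].tolist(), torch.equal(z, x))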
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import time
from argparse import ArgumentParser
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import datasets
import utils
from model import CNN
from nni.nas.pytorch.utils import AverageMeter
from nni.retiarii import fixed_arch
logger = logging.getLogger('nni')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
writer = SummaryWriter()
def train(config, train_loader, model, optimizer, criterion, epoch):
top1 = AverageMeter("top1")
top5 = AverageMeter("top5")
losses = AverageMeter("losses")
cur_step = epoch * len(train_loader)
cur_lr = optimizer.param_groups[0]["lr"]
logger.info("Epoch %d LR %.6f", epoch, cur_lr)
writer.add_scalar("lr", cur_lr, global_step=cur_step)
model.train()
for step, (x, y) in enumerate(train_loader):
x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
bs = x.size(0)
optimizer.zero_grad()
logits, aux_logits = model(x)
loss = criterion(logits, y)
if config.aux_weight > 0.:
loss += config.aux_weight * criterion(aux_logits, y)
loss.backward()
# gradient clipping
nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
optimizer.step()
accuracy = utils.accuracy(logits, y, topk=(1, 5))
losses.update(loss.item(), bs)
top1.update(accuracy["acc1"], bs)
top5.update(accuracy["acc5"], bs)
writer.add_scalar("loss/train", loss.item(), global_step=cur_step)
writer.add_scalar("acc1/train", accuracy["acc1"], global_step=cur_step)
writer.add_scalar("acc5/train", accuracy["acc5"], global_step=cur_step)
if step % config.log_frequency == 0 or step == len(train_loader) - 1:
logger.info(
"Train: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
"Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
epoch + 1, config.epochs, step, len(train_loader) - 1, losses=losses,
top1=top1, top5=top5))
cur_step += 1
logger.info("Train: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, config.epochs, top1.avg))
def validate(config, valid_loader, model, criterion, epoch, cur_step):
top1 = AverageMeter("top1")
top5 = AverageMeter("top5")
losses = AverageMeter("losses")
model.eval()
with torch.no_grad():
for step, (X, y) in enumerate(valid_loader):
X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)
bs = X.size(0)
logits = model(X)
loss = criterion(logits, y)
accuracy = utils.accuracy(logits, y, topk=(1, 5))
losses.update(loss.item(), bs)
top1.update(accuracy["acc1"], bs)
top5.update(accuracy["acc5"], bs)
if step % config.log_frequency == 0 or step == len(valid_loader) - 1:
logger.info(
"Valid: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
"Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
epoch + 1, config.epochs, step, len(valid_loader) - 1, losses=losses,
top1=top1, top5=top5))
writer.add_scalar("loss/test", losses.avg, global_step=cur_step)
writer.add_scalar("acc1/test", top1.avg, global_step=cur_step)
writer.add_scalar("acc5/test", top5.avg, global_step=cur_step)
logger.info("Valid: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, config.epochs, top1.avg))
return top1.avg
if __name__ == "__main__":
parser = ArgumentParser("darts")
parser.add_argument("--layers", default=20, type=int)
parser.add_argument("--batch-size", default=96, type=int)
parser.add_argument("--log-frequency", default=10, type=int)
parser.add_argument("--epochs", default=600, type=int)
parser.add_argument("--aux-weight", default=0.4, type=float)
parser.add_argument("--drop-path-prob", default=0.2, type=float)
parser.add_argument("--workers", default=4)
parser.add_argument("--grad-clip", default=5., type=float)
parser.add_argument("--arc-checkpoint", default="./checkpoints/epoch_0.json")
args = parser.parse_args()
dataset_train, dataset_valid = datasets.get_dataset("cifar10", cutout_length=16)
with fixed_arch(args.arc_checkpoint):
model = CNN(32, 3, 36, 10, args.layers, auxiliary=True)
criterion = nn.CrossEntropyLoss()
model.to(device)
criterion.to(device)
optimizer = torch.optim.SGD(model.parameters(), 0.025, momentum=0.9, weight_decay=3.0E-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=1E-6)
train_loader = torch.utils.data.DataLoader(dataset_train,
batch_size=args.batch_size,
shuffle=True,
num_workers=args.workers,
pin_memory=True)
valid_loader = torch.utils.data.DataLoader(dataset_valid,
batch_size=args.batch_size,
shuffle=False,
num_workers=args.workers,
pin_memory=True)
best_top1 = 0.
for epoch in range(args.epochs):
drop_prob = args.drop_path_prob * epoch / args.epochs
model.drop_path_prob(drop_prob)
# training
train(args, train_loader, model, optimizer, criterion, epoch)
# validation
cur_step = (epoch + 1) * len(train_loader)
top1 = validate(args, valid_loader, model, criterion, epoch, cur_step)
best_top1 = max(best_top1, top1)
lr_scheduler.step()
logger.info("Final best Prec@1 = {:.4%}".format(best_top1))