Unverified Commit a911b856 authored by Yuge Zhang, committed by GitHub

Resolve conflicts for #4760 (#4762)

parent 14d2966b
......@@ -57,6 +57,8 @@ def main(args):
# the configuration for training control
nas_config = NASConfig(
perf_metric=args.perf_metric,
lut_load=args.lut_load,
model_dir=args.snapshot,
nas_lr=args.theta_lr,
mode=args.mode,
......@@ -150,6 +152,15 @@ def main(args):
def parse_args():
def str2bool(s):
if isinstance(s, bool):
return s
if s.lower() in ('yes', 'true', 't', 'y', '1'):
return True
if s.lower() in ('no', 'false', 'f', 'n', '0'):
return False
raise argparse.ArgumentTypeError('Boolean value expected.')
""" Parse the user arguments. """
parser = argparse.ArgumentParser(description="FBNet for PFLD")
parser.add_argument("--dev_id", dest="dev_id", default="0", type=str)
......@@ -174,6 +185,16 @@ def parse_args():
)
parser.add_argument("--train_batchsize", default=256, type=int)
parser.add_argument("--val_batchsize", default=128, type=int)
parser.add_argument(
"--perf_metric", default="flops", type=str, choices=["flops", "latency"]
)
parser.add_argument(
"--lut_load", type=str2bool, default=False
)
parser.add_argument(
"--lut_load_format", default="json", type=str, choices=["json", "numpy"]
)
args = parser.parse_args()
args.snapshot = os.path.join(args.snapshot, 'supernet')
args.log_file = os.path.join(args.snapshot, "{}.log".format('supernet'))
......
......@@ -8,10 +8,12 @@ import re
import torch
import nni.retiarii.nn.pytorch as nn
from nni.retiarii.nn.pytorch import LayerChoice
from nni.retiarii.serializer import model_wrapper
from blocks import ShuffleNetBlock, ShuffleXceptionBlock
@model_wrapper
class ShuffleNetV2OneShot(nn.Module):
block_keys = [
'shufflenet_3x3',
......
# This file is to demo the usage of multi-trial NAS in the usage of SPOS search space.
import click
import time
import json
import nni.retiarii.evaluator.pytorch as pl
import random
import logging
import argparse
import numpy as np
import torch
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import nni
from nn_meter import load_latency_predictor
import nni.retiarii.nn.pytorch as nn
import nni.retiarii.strategy as strategy
from nni.retiarii import serialize
from nni.retiarii.evaluator.functional import FunctionalEvaluator
from nni.retiarii.utils import original_state_dict_hooks
from nni.retiarii.oneshot.pytorch.utils import AverageMeterGroup
from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment
from torchvision import transforms
from torchvision.datasets import CIFAR10
from nn_meter import load_latency_predictor
from network import ShuffleNetV2OneShot
from utils import get_archchoice_by_model
from network import ShuffleNetV2OneShot, load_and_parse_state_dict
from utils import CrossEntropyLabelSmooth, accuracy, ToBGRTensor, get_archchoice_by_model
logger = logging.getLogger("nni.spos.search")
def retrain_bn(model, criterion, max_iters, log_freq, loader):
with torch.no_grad():
logger.info("Clear BN statistics...")
for m in model.modules():
if isinstance(m, nn.BatchNorm2d):
m.running_mean = torch.zeros_like(m.running_mean)
m.running_var = torch.ones_like(m.running_var)
logger.info("Train BN with training set (BN sanitize)...")
model.train()
meters = AverageMeterGroup()
for step in range(max_iters):
inputs, targets = next(iter(loader))
inputs, targets = inputs.to('cuda'), targets.to('cuda')
logits = model(inputs)
loss = criterion(logits, targets)
metrics = accuracy(logits, targets)
metrics["loss"] = loss.item()
meters.update(metrics)
if step % log_freq == 0 or step + 1 == max_iters:
logger.info("Train Step [%d/%d] %s", step + 1, max_iters, meters)
def test_acc(model, criterion, log_freq, loader):
logger.info("Start testing...")
model.eval()
meters = AverageMeterGroup()
start_time = time.time()
with torch.no_grad():
for step, (inputs, targets) in enumerate(loader):
inputs, targets = inputs.to('cuda'), targets.to('cuda')
logits = model(inputs)
loss = criterion(logits, targets)
metrics = accuracy(logits, targets)
metrics["loss"] = loss.item()
meters.update(metrics)
if step % log_freq == 0 or step + 1 == len(loader):
logger.info("Valid Step [%d/%d] time %.3fs acc1 %.4f acc5 %.4f loss %.4f",
step + 1, len(loader), time.time() - start_time,
meters.acc1.avg, meters.acc5.avg, meters.loss.avg)
return meters.acc1.avg
def evaluate_acc(class_cls, criterion, args):
model = class_cls()
with original_state_dict_hooks(model):
model.load_state_dict(load_and_parse_state_dict(args.checkpoint), strict=False)
model.cuda()
if args.spos_preprocessing:
train_trans = transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
transforms.RandomHorizontalFlip(0.5),
ToBGRTensor()
])
else:
train_trans = transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.ToTensor()
])
val_trans = transforms.Compose([
transforms.RandomResizedCrop(224),
ToBGRTensor()
])
train_dataset = datasets.ImageNet(args.imagenet_dir, split='train', transform=train_trans)
val_dataset = datasets.ImageNet(args.imagenet_dir, split='val', transform=val_trans)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, num_workers=args.workers, shuffle=True)
test_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.test_batch_size, num_workers=args.workers, shuffle=True)
acc_before = test_acc(model, criterion, args.log_frequency, test_loader)
nni.report_intermediate_result(acc_before)
retrain_bn(model, criterion, args.train_iters, args.log_frequency, train_loader)
acc = test_acc(model, criterion, args.log_frequency, test_loader)
assert isinstance(acc, float)
nni.report_intermediate_result(acc)
nni.report_final_result(acc)
class LatencyFilter:
def __init__(self, threshold, predictor, predictor_version=None, reverse=False):
"""
Filter the models according to predicted latency.
Filter the models according to predicted latency. If the predicted latency of the ir model is larger than
the given threshold, the ir model will be filtered and will not be considered as a searched architecture.
Parameters
----------
......@@ -37,42 +128,62 @@ class LatencyFilter:
return latency < self.threshold
@click.command()
@click.option('--port', default=8081, help='On which port the experiment is run.')
def _main(port):
base_model = ShuffleNetV2OneShot(32)
base_predictor = 'cortexA76cpu_tflite21'
transf = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
normalize = [
transforms.ToTensor(),
transforms.Normalize([0.49139968, 0.48215827, 0.44653124], [0.24703233, 0.24348505, 0.26158768])
]
# FIXME
# CIFAR10 is used here temporarily.
# Actually we should load weight from supernet and evaluate on imagenet.
train_dataset = serialize(CIFAR10, 'data', train=True, download=True, transform=transforms.Compose(transf + normalize))
test_dataset = serialize(CIFAR10, 'data', train=False, transform=transforms.Compose(normalize))
trainer = pl.Classification(train_dataloader=pl.DataLoader(train_dataset, batch_size=64),
val_dataloaders=pl.DataLoader(test_dataset, batch_size=64),
max_epochs=2, gpus=1)
simple_strategy = strategy.RegularizedEvolution(model_filter=LatencyFilter(threshold=100, predictor=base_predictor),
sample_size=1, population_size=2, cycles=2)
exp = RetiariiExperiment(base_model, trainer, strategy=simple_strategy)
def _main():
parser = argparse.ArgumentParser("SPOS Evolutional Search")
parser.add_argument("--port", type=int, default=8084)
parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet")
parser.add_argument("--checkpoint", type=str, default="./data/checkpoint-150000.pth.tar")
parser.add_argument("--spos-preprocessing", action="store_true", default=False,
help="When true, image values will range from 0 to 255 and use BGR "
"(as in original repo).")
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--workers", type=int, default=6)
parser.add_argument("--train-batch-size", type=int, default=128)
parser.add_argument("--train-iters", type=int, default=200)
parser.add_argument("--test-batch-size", type=int, default=512)
parser.add_argument("--log-frequency", type=int, default=10)
parser.add_argument("--label-smoothing", type=float, default=0.1)
parser.add_argument("--evolution-sample-size", type=int, default=10)
parser.add_argument("--evolution-population-size", type=int, default=50)
parser.add_argument("--evolution-cycles", type=int, default=10)
parser.add_argument("--latency-filter", type=str, default=None,
help="Apply latency filter by calling the name of the applied hardware.")
parser.add_argument("--latency-threshold", type=float, default=100)
args = parser.parse_args()
# using a fixed set of images will improve the performance
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)
torch.backends.cudnn.deterministic = True
assert torch.cuda.is_available()
base_model = ShuffleNetV2OneShot()
criterion = CrossEntropyLabelSmooth(1000, args.label_smoothing)
if args.latency_filter:
latency_filter = LatencyFilter(threshold=args.latency_threshold, predictor=args.latency_filter)
else:
latency_filter = None
evaluator = FunctionalEvaluator(evaluate_acc, criterion=criterion, args=args)
evolution_strategy = strategy.RegularizedEvolution(
model_filter=latency_filter,
sample_size=args.evolution_sample_size, population_size=args.evolution_population_size, cycles=args.evolution_cycles)
exp = RetiariiExperiment(base_model, evaluator, strategy=evolution_strategy)
exp_config = RetiariiExeConfig('local')
exp_config.trial_concurrency = 2
# exp_config.max_trial_number = 2
exp_config.trial_gpu_number = 1
exp_config.max_trial_number = args.evolution_cycles
exp_config.training_service.use_active_gpu = False
exp_config.execution_engine = 'base'
exp_config.dummy_input = [1, 3, 32, 32]
exp_config.dummy_input = [1, 3, 224, 224]
exp.run(exp_config, port)
exp.run(exp_config, args.port)
print('Exported models:')
for i, model in enumerate(exp.export_top_models(formatter='dict')):
......@@ -80,6 +191,5 @@ def _main(port):
with open(f'architecture_final_{i}.json', 'w') as f:
json.dump(get_archchoice_by_model(model), f, indent=4)
if __name__ == '__main__':
if __name__ == "__main__":
_main()
......@@ -47,10 +47,7 @@ if __name__ == "__main__":
if args.load_checkpoint:
if not args.spos_preprocessing:
logger.warning("You might want to use SPOS preprocessing if you are loading their checkpoints.")
# load state_dict and
model_dict = model.state_dict()
model_dict.update(load_and_parse_state_dict())
model.load_state_dict(model_dict)
model.load_state_dict(load_and_parse_state_dict(), strict=False)
logger.info(f'Model loaded from ./data/checkpoint-150000.pth.tar')
model.cuda()
if torch.cuda.device_count() > 1: # exclude last gpu, saving for data preprocessing on gpu
......
......@@ -57,8 +57,13 @@ class ToBGRTensor(object):
def get_archchoice_by_model(model):
"""
For ``exp_config.execution_engine = 'base'``, the top model exported by ``exp.export_top_models`` is a dict with the format
``"LayerChoice1": "layerchoice_LayerChoice1_0"``. However, when loading an architecture with ``fixed_arch``, the dict value needs
to be converted to the bare choice index. This function removes the "layerchoice" prefix from each choice value to meet the requirement of ``fixed_arch``.
The output will be ``"LayerChoice1": "0"``.
"""
result = {}
for k, v in model.items():
assert k in v
result[k] = model[k].split("_")[-1]
return result
tensorflow==1.15.4
tensorflow # tested version: 1.15.4
numpy==1.18.5
tensorflow==1.15.4
torchvision==0.2.1
Keras==2.3.1
torch==0.4.1
numpy # tested version: 1.18.5
tensorflow # tested version: 1.15.4
torchvision # tested version: 0.2.1
Keras # tested version: 2.3.1
torch # tested version: 0.4.1
data/
log/
*.onnx
\ No newline at end of file
"""
Hello, NAS!
===========
This is the 101 tutorial of Neural Architecture Search (NAS) on NNI.
In this tutorial, we will search for a neural architecture on the MNIST dataset with the help of the NAS framework of NNI, i.e., *Retiarii*.
We use multi-trial NAS as an example to show how to construct and explore a model space.
There are mainly three crucial components for a neural architecture search task, namely,
* Model search space that defines a set of models to explore.
* A proper strategy as the method to explore this model space.
* A model evaluator that reports the performance of every model in the space.
Currently, PyTorch is the only framework supported by Retiarii, and we have only tested it with **PyTorch 1.7 to 1.10**.
This tutorial assumes a PyTorch context, but it should also apply to other frameworks, which is in our future plan.
Define your Model Space
-----------------------
A model space is defined by users to express the set of models they want to explore, which contains potentially good-performing models.
In this framework, a model space is defined with two parts: a base model and possible mutations on the base model.
"""
# %%
#
# Define Base Model
# ^^^^^^^^^^^^^^^^^
#
# Defining a base model is almost the same as defining a PyTorch (or TensorFlow) model.
# Usually, you only need to replace the code ``import torch.nn as nn`` with
# ``import nni.retiarii.nn.pytorch as nn`` to use our wrapped PyTorch modules.
#
# Below is a very simple example of defining a base model.
import torch
import torch.nn.functional as F
import nni.retiarii.nn.pytorch as nn
from nni.retiarii import model_wrapper
@model_wrapper # this decorator should be put on the outermost PyTorch module
class Net(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(1, 32, 3, 1)
self.conv2 = nn.Conv2d(32, 64, 3, 1)
self.dropout1 = nn.Dropout(0.25)
self.dropout2 = nn.Dropout(0.5)
self.fc1 = nn.Linear(9216, 128)
self.fc2 = nn.Linear(128, 10)
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.max_pool2d(self.conv2(x), 2)
x = torch.flatten(self.dropout1(x), 1)
x = self.fc2(self.dropout2(F.relu(self.fc1(x))))
output = F.log_softmax(x, dim=1)
return output
# %%
# .. tip:: Always keep in mind that you should use ``import nni.retiarii.nn.pytorch as nn`` and :meth:`nni.retiarii.model_wrapper`.
# Many mistakes are a result of forgetting one of those.
# Also, please use ``torch.nn`` for submodules such as ``init``, e.g., ``torch.nn.init`` instead of ``nn.init``.
#
# Define Model Mutations
# ^^^^^^^^^^^^^^^^^^^^^^
#
# A base model is only one concrete model, not a model space. We provide :doc:`API and Primitives </nas/construct_space>`
# for users to express how the base model can be mutated, that is, to build a model space that includes many models.
#
# Based on the above base model, we can define a model space as below.
#
# .. code-block:: diff
#
# @model_wrapper
# class Net(nn.Module):
# def __init__(self):
# super().__init__()
# self.conv1 = nn.Conv2d(1, 32, 3, 1)
# - self.conv2 = nn.Conv2d(32, 64, 3, 1)
# + self.conv2 = nn.LayerChoice([
# + nn.Conv2d(32, 64, 3, 1),
# + DepthwiseSeparableConv(32, 64)
# + ])
# - self.dropout1 = nn.Dropout(0.25)
# + self.dropout1 = nn.Dropout(nn.ValueChoice([0.25, 0.5, 0.75]))
# self.dropout2 = nn.Dropout(0.5)
# - self.fc1 = nn.Linear(9216, 128)
# - self.fc2 = nn.Linear(128, 10)
# + feature = nn.ValueChoice([64, 128, 256])
# + self.fc1 = nn.Linear(9216, feature)
# + self.fc2 = nn.Linear(feature, 10)
#
# def forward(self, x):
# x = F.relu(self.conv1(x))
# x = F.max_pool2d(self.conv2(x), 2)
# x = torch.flatten(self.dropout1(x), 1)
# x = self.fc2(self.dropout2(F.relu(self.fc1(x))))
# output = F.log_softmax(x, dim=1)
# return output
#
# This results in the following code:
class DepthwiseSeparableConv(nn.Module):
def __init__(self, in_ch, out_ch):
super().__init__()
self.depthwise = nn.Conv2d(in_ch, in_ch, kernel_size=3, groups=in_ch)
self.pointwise = nn.Conv2d(in_ch, out_ch, kernel_size=1)
def forward(self, x):
return self.pointwise(self.depthwise(x))
@model_wrapper
class ModelSpace(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(1, 32, 3, 1)
# LayerChoice is used to select a layer between Conv2d and DwConv.
self.conv2 = nn.LayerChoice([
nn.Conv2d(32, 64, 3, 1),
DepthwiseSeparableConv(32, 64)
])
# ValueChoice is used to select a dropout rate.
# ValueChoice can be used as parameter of modules wrapped in `nni.retiarii.nn.pytorch`
# or customized modules wrapped with `@basic_unit`.
self.dropout1 = nn.Dropout(nn.ValueChoice([0.25, 0.5, 0.75])) # choose dropout rate from 0.25, 0.5 and 0.75
self.dropout2 = nn.Dropout(0.5)
feature = nn.ValueChoice([64, 128, 256])
self.fc1 = nn.Linear(9216, feature)
self.fc2 = nn.Linear(feature, 10)
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.max_pool2d(self.conv2(x), 2)
x = torch.flatten(self.dropout1(x), 1)
x = self.fc2(self.dropout2(F.relu(self.fc1(x))))
output = F.log_softmax(x, dim=1)
return output
model_space = ModelSpace()
model_space
# %%
# This example uses two mutation APIs,
# :class:`nn.LayerChoice <nni.retiarii.nn.pytorch.LayerChoice>` and
# :class:`nn.ValueChoice <nni.retiarii.nn.pytorch.ValueChoice>`.
# :class:`nn.LayerChoice <nni.retiarii.nn.pytorch.LayerChoice>`
# takes a list of candidate modules (two in this example), one of which will be chosen for each sampled model.
# It can be used like a normal PyTorch module.
# :class:`nn.ValueChoice <nni.retiarii.nn.pytorch.ValueChoice>` takes a list of candidate values,
# one of which will be chosen to take effect for each sampled model.
#
# More detailed API description and usage can be found :doc:`here </nas/construct_space>`.
#
# .. note::
#
# We are actively enriching the mutation APIs, to facilitate easy construction of model space.
# If the currently supported mutation APIs cannot express your model space,
# please refer to :doc:`this doc </nas/mutator>` for customizing mutators.
#
# Explore the Defined Model Space
# -------------------------------
#
# There are basically two exploration approaches: (1) search by evaluating each sampled model independently,
# which is the search approach in :ref:`multi-trial NAS <multi-trial-nas>`
# and (2) one-shot weight-sharing based search, which is used in one-shot NAS.
# We demonstrate the first approach in this tutorial. Users can refer to :ref:`here <one-shot-nas>` for the second approach.
#
# First, users need to pick a proper exploration strategy to explore the defined model space.
# Second, users need to pick or customize a model evaluator to evaluate the performance of each explored model.
#
# Pick an exploration strategy
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Retiarii supports many :doc:`exploration strategies </nas/exploration_strategy>`.
#
# Simply choose (i.e., instantiate) an exploration strategy as below.
import nni.retiarii.strategy as strategy
search_strategy = strategy.Random(dedup=True) # dedup=False if deduplication is not wanted
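# %%
# Other strategies are instantiated in the same way. For instance, the SPOS search script changed in
# this commit uses regularized evolution (a sketch; the parameter values are that script's defaults,
# not recommendations):
#
# .. code-block:: python
#
#    evolution_strategy = strategy.RegularizedEvolution(
#        sample_size=10, population_size=50, cycles=10)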
# %%
# Pick or customize a model evaluator
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# In the exploration process, the exploration strategy repeatedly generates new models. A model evaluator is for training
# and validating each generated model to obtain the model's performance.
# The performance is sent to the exploration strategy for the strategy to generate better models.
#
# Retiarii has provided :doc:`built-in model evaluators </nas/evaluator>`, but to start with,
# it is recommended to use :class:`FunctionalEvaluator <nni.retiarii.evaluator.FunctionalEvaluator>`,
# that is, to wrap your own training and evaluation code with one single function.
# This function should receive one single model class and use :func:`nni.report_final_result` to report the final score of this model.
#
# An example here creates a simple evaluator that runs on the MNIST dataset, trains for 3 epochs, and reports its validation accuracy.
import nni
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
def train_epoch(model, device, train_loader, optimizer, epoch):
loss_fn = torch.nn.CrossEntropyLoss()
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = loss_fn(output, target)
loss.backward()
optimizer.step()
if batch_idx % 10 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
def test_epoch(model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
accuracy = 100. * correct / len(test_loader.dataset)
print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
correct, len(test_loader.dataset), accuracy))
return accuracy
def evaluate_model(model_cls):
# "model_cls" is a class, need to instantiate
model = model_cls()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
transf = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_loader = DataLoader(MNIST('data/mnist', download=True, transform=transf), batch_size=64, shuffle=True)
test_loader = DataLoader(MNIST('data/mnist', download=True, train=False, transform=transf), batch_size=64)
for epoch in range(3):
# train the model for one epoch
train_epoch(model, device, train_loader, optimizer, epoch)
# test the model for one epoch
accuracy = test_epoch(model, device, test_loader)
# call report intermediate result. Result can be float or dict
nni.report_intermediate_result(accuracy)
# report final test result
nni.report_final_result(accuracy)
# %%
# Create the evaluator
from nni.retiarii.evaluator import FunctionalEvaluator
evaluator = FunctionalEvaluator(evaluate_model)
# %%
#
# The ``train_epoch`` and ``test_epoch`` here can be any customized function,
# where users can write their own training recipe.
#
# It is recommended that the ``evaluate_model`` here accepts no additional arguments other than ``model_cls``.
# However, in the :doc:`advanced tutorial </nas/evaluator>`, we will show how to use additional arguments in case you actually need those.
# In the future, we will support mutation on the arguments of evaluators, which is commonly called "hyper-parameter tuning".
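#
# As a rough sketch of what that looks like, the SPOS search script changed in this commit binds
# extra keyword arguments directly when constructing the evaluator (``evaluate_acc``, ``criterion``
# and ``args`` are names from that script, not from this tutorial):
#
# .. code-block:: python
#
#    evaluator = FunctionalEvaluator(evaluate_acc, criterion=criterion, args=args)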
#
# Launch an Experiment
# --------------------
#
# After all the above are prepared, it is time to start an experiment to do the model search. An example is shown below.
from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig
exp = RetiariiExperiment(model_space, evaluator, [], search_strategy)
exp_config = RetiariiExeConfig('local')
exp_config.experiment_name = 'mnist_search'
# %%
# The following configurations are useful to control how many trials to run at most / at the same time.
exp_config.max_trial_number = 4 # spawn 4 trials at most
exp_config.trial_concurrency = 2 # will run two trials concurrently
# %%
# Remember to set the following config if you want to use a GPU.
# ``use_active_gpu`` should be set true if you wish to use an occupied GPU (possibly running a GUI).
exp_config.trial_gpu_number = 1
exp_config.training_service.use_active_gpu = True
# %%
# Launch the experiment. The experiment should take several minutes to finish on a workstation with 2 GPUs.
exp.run(exp_config, 8081)
# %%
# Users can also run Retiarii Experiment with :doc:`different training services </experiment/training_service/overview>`
# besides ``local`` training service.
#
# Visualize the Experiment
# ------------------------
#
# Users can visualize their experiment in the same way as visualizing a normal hyper-parameter tuning experiment.
# For example, open ``localhost:8081`` in your browser; 8081 is the port that you set in ``exp.run``.
# Please refer to :doc:`here </experiment/web_portal/web_portal>` for details.
#
# We support visualizing models with 3rd-party visualization engines (like `Netron <https://netron.app/>`__).
# This can be used by clicking ``Visualization`` in detail panel for each trial.
# Note that the current visualization is based on `onnx <https://onnx.ai/>`__,
# thus visualization is not feasible if the model cannot be exported into ONNX.
#
# Built-in evaluators (e.g., Classification) will automatically export the model into a file.
# For your own evaluator, you need to save your file into ``$NNI_OUTPUT_DIR/model.onnx`` to make this work.
# For instance,
import os
from pathlib import Path
def evaluate_model_with_visualization(model_cls):
model = model_cls()
# dump the model into an onnx
if 'NNI_OUTPUT_DIR' in os.environ:
dummy_input = torch.zeros(1, 3, 32, 32)
torch.onnx.export(model, (dummy_input, ),
Path(os.environ['NNI_OUTPUT_DIR']) / 'model.onnx')
evaluate_model(model_cls)
# %%
# Relaunch the experiment, and a button is shown on the web portal.
#
# .. image:: ../../img/netron_entrance_webui.png
#
# Export Top Models
# -----------------
#
# Users can export top models after the exploration is done using ``export_top_models``.
for model_dict in exp.export_top_models(formatter='dict'):
print(model_dict)
# %%
# The output is a JSON object which records the mutation actions of the top model.
# If users want to output source code of the top model,
# they can use :ref:`graph-based execution engine <graph-based-execution-engine>` for the experiment,
# by simply adding the following two lines.
exp_config.execution_engine = 'base'
export_formatter = 'code'
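# %%
# With the graph-based engine configured, the source code of a top model can then be exported by
# passing the formatter (a sketch; this tutorial itself only uses the ``dict`` formatter above):
#
# .. code-block:: python
#
#    for model_code in exp.export_top_models(formatter=export_formatter):
#        print(model_code)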
search_space:
features:
_type: choice
_value: [ 128, 256, 512, 1024 ]
lr:
_type: loguniform
_value: [ 0.0001, 0.1 ]
momentum:
_type: uniform
_value: [ 0, 1 ]
trial_command: python model.py
trial_code_directory: .
trial_concurrency: 2
max_trial_number: 10
tuner:
name: TPE
class_args:
optimize_mode: maximize
training_service:
platform: local
"""
Port PyTorch Quickstart to NNI
==============================
This is a modified version of `PyTorch quickstart`_.
It can be run directly and will have the exact same result as the original version.
Furthermore, it enables auto-tuning with an NNI *experiment*, which will be detailed later.
It is recommended to run this script directly first to verify the environment.
There are 2 key differences from the original version:
1. In `Get optimized hyperparameters`_ part, it receives generated hyperparameters.
2. In `Train model and report accuracy`_ part, it reports accuracy metrics to NNI.
.. _PyTorch quickstart: https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
"""
# %%
import nni
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
# %%
# Hyperparameters to be tuned
# ---------------------------
# These are the hyperparameters that will be tuned.
params = {
'features': 512,
'lr': 0.001,
'momentum': 0,
}
# %%
# Get optimized hyperparameters
# -----------------------------
# If run directly, :func:`nni.get_next_parameter` is a no-op and returns an empty dict.
# But with an NNI *experiment*, it will receive optimized hyperparameters from tuning algorithm.
optimized_params = nni.get_next_parameter()
params.update(optimized_params)
print(params)
# %%
# Load dataset
# ------------
training_data = datasets.FashionMNIST(root="data", train=True, download=True, transform=ToTensor())
test_data = datasets.FashionMNIST(root="data", train=False, download=True, transform=ToTensor())
batch_size = 64
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)
# %%
# Build model with hyperparameters
# --------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
class NeuralNetwork(nn.Module):
def __init__(self):
super(NeuralNetwork, self).__init__()
self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(28*28, params['features']),
nn.ReLU(),
nn.Linear(params['features'], params['features']),
nn.ReLU(),
nn.Linear(params['features'], 10)
)
def forward(self, x):
x = self.flatten(x)
logits = self.linear_relu_stack(x)
return logits
model = NeuralNetwork().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=params['lr'], momentum=params['momentum'])
# %%
# Define train and test
# ---------------------
def train(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
model.train()
for batch, (X, y) in enumerate(dataloader):
X, y = X.to(device), y.to(device)
pred = model(X)
loss = loss_fn(pred, y)
optimizer.zero_grad()
loss.backward()
optimizer.step()
def test(dataloader, model, loss_fn):
size = len(dataloader.dataset)
num_batches = len(dataloader)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
for X, y in dataloader:
X, y = X.to(device), y.to(device)
pred = model(X)
test_loss += loss_fn(pred, y).item()
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
test_loss /= num_batches
correct /= size
return correct
# %%
# Train model and report accuracy
# -------------------------------
# Report accuracy metrics to NNI so the tuning algorithm can suggest better hyperparameters.
epochs = 5
for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
train(train_dataloader, model, loss_fn, optimizer)
accuracy = test(test_dataloader, model, loss_fn)
nni.report_intermediate_result(accuracy)
nni.report_final_result(accuracy)
Run HPO Experiment with nnictl
==============================
This tutorial has exactly the same effect as :doc:`PyTorch quickstart <../hpo_quickstart_pytorch/main>`.
Both tutorials optimize the model in `official PyTorch quickstart
<https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html>`__ with auto-tuning,
while this one manages the experiment with the command line tool and a YAML config file, instead of pure Python code.
The tutorial consists of 4 steps:
1. Modify the model for auto-tuning.
2. Define hyperparameters' search space.
3. Create config file.
4. Run the experiment.
The first two steps are identical to quickstart.
Step 1: Prepare the model
-------------------------
In the first step, we need to prepare the model to be tuned.
The model should be put in a separate script.
It will be evaluated many times concurrently,
and possibly will be trained on distributed platforms.
In this tutorial, the model is defined in :doc:`model.py <model>`.
In short, it is a PyTorch model with 3 additional API calls:
1. Use :func:`nni.get_next_parameter` to fetch the hyperparameters to be evaluated.
2. Use :func:`nni.report_intermediate_result` to report per-epoch accuracy metrics.
3. Use :func:`nni.report_final_result` to report final accuracy.
Please understand the model code before continuing to the next step.
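For reference, these calls appear in the model script roughly as below (a condensed sketch of
:doc:`model.py <model>`, not a complete script):

.. code-block:: python

    params.update(nni.get_next_parameter())   # receive hyperparameters for this trial
    nni.report_intermediate_result(accuracy)  # report per-epoch accuracy
    nni.report_final_result(accuracy)         # report final accuracy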
Step 2: Define search space
---------------------------
In the model code, we have prepared 3 hyperparameters to be tuned:
*features*, *lr*, and *momentum*.
Here we need to define their *search space* so the tuning algorithm can sample them in the desired range.
Assume we have the following prior knowledge for these hyperparameters:
1. *features* should be one of 128, 256, 512, 1024.
2. *lr* should be a float between 0.0001 and 0.1, and it follows exponential distribution.
3. *momentum* should be a float between 0 and 1.
In NNI, the space of *features* is called ``choice``;
the space of *lr* is called ``loguniform``;
and the space of *momentum* is called ``uniform``.
You may have noticed that these names are derived from ``numpy.random``.
For full specification of search space, check :doc:`the reference </hpo/search_space>`.
Now we can define the search space as follows:
.. code-block:: yaml
search_space:
features:
_type: choice
_value: [ 128, 256, 512, 1024 ]
lr:
_type: loguniform
_value: [ 0.0001, 0.1 ]
momentum:
_type: uniform
_value: [ 0, 1 ]
Step 3: Configure the experiment
--------------------------------
NNI uses an *experiment* to manage the HPO process.
The *experiment config* defines how to train the models and how to explore the search space.
In this tutorial we use a YAML file ``config.yaml`` to define the experiment.
Configure trial code
^^^^^^^^^^^^^^^^^^^^
In NNI, the evaluation of each hyperparameter set is called a *trial*.
So the model script is called *trial code*.
.. code-block:: yaml
trial_command: python model.py
trial_code_directory: .
When ``trial_code_directory`` is a relative path, it is relative to the config file.
So in this case we need to put ``config.yaml`` and ``model.py`` in the same directory.
.. attention::
The rules for resolving relative paths are different in the YAML config file and the :doc:`Python experiment API </reference/experiment>`.
In the Python experiment API, relative paths are relative to the current working directory.
Configure how many trials to run
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here we evaluate 10 sets of hyperparameters in total, and concurrently evaluate 2 sets at a time.
.. code-block:: yaml
max_trial_number: 10
trial_concurrency: 2
You may also set ``max_experiment_duration = '1h'`` to limit running time.
If neither ``max_trial_number`` nor ``max_experiment_duration`` are set,
the experiment will run forever until you stop it.
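For example, to cap the running time at one hour, the limit could be added to ``config.yaml`` like this
(a sketch, not part of this tutorial's config file):

.. code-block:: yaml

    max_experiment_duration: 1h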
.. note::
``max_trial_number`` is set to 10 here for a fast example.
In the real world, it should be set to a larger number.
With the default config, the TPE tuner requires 20 trials to warm up.
Configure tuning algorithm
^^^^^^^^^^^^^^^^^^^^^^^^^^
Here we use :doc:`TPE tuner </hpo/tuners>`.
.. code-block:: yaml
name: TPE
class_args:
optimize_mode: maximize
Configure training service
^^^^^^^^^^^^^^^^^^^^^^^^^^
In this tutorial we use *local* mode,
which means models will be trained on local machine, without using any special training platform.
.. code-block:: yaml
training_service:
platform: local
Wrap up
^^^^^^^
The full content of ``config.yaml`` is as follows:
.. code-block:: yaml
search_space:
features:
_type: choice
_value: [ 128, 256, 512, 1024 ]
lr:
_type: loguniform
_value: [ 0.0001, 0.1 ]
momentum:
_type: uniform
_value: [ 0, 1 ]
trial_command: python model.py
trial_code_directory: .
trial_concurrency: 2
max_trial_number: 10
tuner:
name: TPE
class_args:
optimize_mode: maximize
training_service:
platform: local
Step 4: Run the experiment
--------------------------
Now the experiment is ready. Launch it with the ``nnictl create`` command:
.. code-block:: bash
$ nnictl create --config config.yaml --port 8080
You can use the web portal to view experiment status: http://localhost:8080.
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
[2022-04-01 12:00:00] Creating experiment, Experiment ID: p43ny6ew
[2022-04-01 12:00:00] Starting web server...
[2022-04-01 12:00:01] Setting up...
[2022-04-01 12:00:01] Web portal URLs: http://127.0.0.1:8080 http://192.168.1.1:8080
[2022-04-01 12:00:01] To stop experiment run "nnictl stop p43ny6ew" or "nnictl stop --all"
[2022-04-01 12:00:01] Reference: https://nni.readthedocs.io/en/stable/reference/nnictl.html
When the experiment is done, use the ``nnictl stop`` command to stop it.
.. code-block:: bash
$ nnictl stop p43ny6ew
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
INFO: Stopping experiment p43ny6ew
INFO: Stop experiment success.
"""
HPO Quickstart with PyTorch
===========================
This tutorial optimizes the model in `official PyTorch quickstart`_ with auto-tuning.
The tutorial consists of 4 steps:
1. Modify the model for auto-tuning.
2. Define hyperparameters' search space.
3. Configure the experiment.
4. Run the experiment.
.. _official PyTorch quickstart: https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
"""
# %%
# Step 1: Prepare the model
# -------------------------
# In the first step, we need to prepare the model to be tuned.
#
# The model should be put in a separate script.
# It will be evaluated many times concurrently,
# and possibly will be trained on distributed platforms.
#
# In this tutorial, the model is defined in :doc:`model.py <model>`.
#
# In short, it is a PyTorch model with 3 additional API calls:
#
# 1. Use :func:`nni.get_next_parameter` to fetch the hyperparameters to be evaluated.
# 2. Use :func:`nni.report_intermediate_result` to report per-epoch accuracy metrics.
# 3. Use :func:`nni.report_final_result` to report final accuracy.
#
# Please understand the model code before continuing to the next step.
# %%
# Step 2: Define search space
# ---------------------------
# In the model code, we have prepared 3 hyperparameters to be tuned:
# *features*, *lr*, and *momentum*.
#
# Here we need to define their *search space* so the tuning algorithm can sample them in the desired range.
#
# Assume we have the following prior knowledge for these hyperparameters:
#
# 1. *features* should be one of 128, 256, 512, 1024.
# 2. *lr* should be a float between 0.0001 and 0.1, and it follows exponential distribution.
# 3. *momentum* should be a float between 0 and 1.
#
# In NNI, the space of *features* is called ``choice``;
# the space of *lr* is called ``loguniform``;
# and the space of *momentum* is called ``uniform``.
# You may have noticed that these names are derived from ``numpy.random``.
#
# For full specification of search space, check :doc:`the reference </hpo/search_space>`.
#
# Now we can define the search space as follows:
search_space = {
'features': {'_type': 'choice', '_value': [128, 256, 512, 1024]},
'lr': {'_type': 'loguniform', '_value': [0.0001, 0.1]},
'momentum': {'_type': 'uniform', '_value': [0, 1]},
}
# %%
# Step 3: Configure the experiment
# --------------------------------
# NNI uses an *experiment* to manage the HPO process.
# The *experiment config* defines how to train the models and how to explore the search space.
#
# In this tutorial we use a *local* mode experiment,
# which means models will be trained on local machine, without using any special training platform.
from nni.experiment import Experiment
experiment = Experiment('local')
# %%
# Now we start to configure the experiment.
#
# Configure trial code
# ^^^^^^^^^^^^^^^^^^^^
# In NNI, the evaluation of each hyperparameter set is called a *trial*.
# So the model script is called *trial code*.
experiment.config.trial_command = 'python model.py'
experiment.config.trial_code_directory = '.'
# %%
# When ``trial_code_directory`` is a relative path, it is relative to the current working directory.
# To run ``main.py`` in a different path, you can set trial code directory to ``Path(__file__).parent``.
# (`__file__ <https://docs.python.org/3.10/reference/datamodel.html#index-43>`__
# is only available in standard Python, not in Jupyter Notebook.)
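#
# For example, the directory could be set like this (a sketch; it only works when this file is
# run as a standard Python script):
#
# .. code-block:: python
#
#     from pathlib import Path
#     experiment.config.trial_code_directory = Path(__file__).parent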
#
# .. attention::
#
# If you are using Linux system without Conda,
# you may need to change ``"python model.py"`` to ``"python3 model.py"``.
# %%
# Configure search space
# ^^^^^^^^^^^^^^^^^^^^^^
experiment.config.search_space = search_space
# %%
# Configure tuning algorithm
# ^^^^^^^^^^^^^^^^^^^^^^^^^^
# Here we use :doc:`TPE tuner </hpo/tuners>`.
experiment.config.tuner.name = 'TPE'
experiment.config.tuner.class_args['optimize_mode'] = 'maximize'
# %%
# Configure how many trials to run
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Here we evaluate 10 sets of hyperparameters in total, and concurrently evaluate 2 sets at a time.
experiment.config.max_trial_number = 10
experiment.config.trial_concurrency = 2
# %%
# You may also set ``max_experiment_duration = '1h'`` to limit running time.
#
# If neither ``max_trial_number`` nor ``max_experiment_duration`` are set,
# the experiment will run forever until you press Ctrl-C.
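#
# For example, the duration limit could be set like this (a sketch, not applied in this tutorial):
#
# .. code-block:: python
#
#     experiment.config.max_experiment_duration = '1h'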
#
# .. note::
#
# ``max_trial_number`` is set to 10 here for a fast example.
# In the real world, it should be set to a larger number.
# With the default config, the TPE tuner requires 20 trials to warm up.
# %%
# Step 4: Run the experiment
# --------------------------
# Now the experiment is ready. Choose a port and launch it. (Here we use port 8080.)
#
# You can use the web portal to view experiment status: http://localhost:8080.
experiment.run(8080)
# %%
# After the experiment is done
# ----------------------------
# Everything is done and it is safe to exit now. The following are optional.
#
# If you are using standard Python instead of Jupyter Notebook,
# you can add ``input()`` or ``signal.pause()`` to prevent Python from exiting,
# allowing you to view the web portal after the experiment is done.
# input('Press enter to quit')
experiment.stop()
# %%
# :meth:`nni.experiment.Experiment.stop` is automatically invoked when Python exits,
# so it can be omitted in your code.
#
# After the experiment is stopped, you can run :meth:`nni.experiment.Experiment.view` to restart web portal.
#
# .. tip::
#
# This example uses :doc:`Python API </reference/experiment>` to create experiment.
#
# You can also create and manage experiments with :doc:`command line tool <../hpo_nnictl/nnictl>`.
"""
Port PyTorch Quickstart to NNI
==============================
This is a modified version of `PyTorch quickstart`_.
It can be run directly and will have the exact same result as the original version.
Furthermore, it enables auto-tuning with an NNI *experiment*, which will be detailed later.
It is recommended to run this script directly first to verify the environment.
There are 2 key differences from the original version:
1. In `Get optimized hyperparameters`_ part, it receives generated hyperparameters.
2. In `Train model and report accuracy`_ part, it reports accuracy metrics to NNI.
.. _PyTorch quickstart: https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
"""
# %%
import nni
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
# %%
# Hyperparameters to be tuned
# ---------------------------
# These are the hyperparameters that will be tuned.
params = {
'features': 512,
'lr': 0.001,
'momentum': 0,
}
# %%
# Get optimized hyperparameters
# -----------------------------
# If run directly, :func:`nni.get_next_parameter` is a no-op and returns an empty dict.
# But with an NNI *experiment*, it will receive optimized hyperparameters from tuning algorithm.
optimized_params = nni.get_next_parameter()
params.update(optimized_params)
print(params)
# %%
# Load dataset
# ------------
training_data = datasets.FashionMNIST(root="data", train=True, download=True, transform=ToTensor())
test_data = datasets.FashionMNIST(root="data", train=False, download=True, transform=ToTensor())
batch_size = 64
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)
# %%
# Build model with hyperparameters
# --------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
class NeuralNetwork(nn.Module):
def __init__(self):
super(NeuralNetwork, self).__init__()
self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(28*28, params['features']),
nn.ReLU(),
nn.Linear(params['features'], params['features']),
nn.ReLU(),
nn.Linear(params['features'], 10)
)
def forward(self, x):
x = self.flatten(x)
logits = self.linear_relu_stack(x)
return logits
model = NeuralNetwork().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=params['lr'], momentum=params['momentum'])
# %%
# Define train and test
# ---------------------
def train(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
model.train()
for batch, (X, y) in enumerate(dataloader):
X, y = X.to(device), y.to(device)
pred = model(X)
loss = loss_fn(pred, y)
optimizer.zero_grad()
loss.backward()
optimizer.step()
def test(dataloader, model, loss_fn):
size = len(dataloader.dataset)
num_batches = len(dataloader)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
for X, y in dataloader:
X, y = X.to(device), y.to(device)
pred = model(X)
test_loss += loss_fn(pred, y).item()
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
test_loss /= num_batches
correct /= size
return correct
# %%
# Train model and report accuracy
# -------------------------------
# Report accuracy metrics to NNI so the tuning algorithm can suggest better hyperparameters.
epochs = 5
for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
train(train_dataloader, model, loss_fn, optimizer)
accuracy = test(test_dataloader, model, loss_fn)
nni.report_intermediate_result(accuracy)
nni.report_final_result(accuracy)
"""
HPO Quickstart with TensorFlow
==============================
This tutorial optimizes the model in `official TensorFlow quickstart`_ with auto-tuning.
The tutorial consists of 4 steps:
1. Modify the model for auto-tuning.
2. Define hyperparameters' search space.
3. Configure the experiment.
4. Run the experiment.
.. _official TensorFlow quickstart: https://www.tensorflow.org/tutorials/quickstart/beginner
"""
# %%
# Step 1: Prepare the model
# -------------------------
# In the first step, we need to prepare the model to be tuned.
#
# The model should be put in a separate script.
# It will be evaluated many times concurrently,
# and possibly will be trained on distributed platforms.
#
# In this tutorial, the model is defined in :doc:`model.py <model>`.
#
# In short, it is a TensorFlow model with 3 additional API calls:
#
# 1. Use :func:`nni.get_next_parameter` to fetch the hyperparameters to be evaluated.
# 2. Use :func:`nni.report_intermediate_result` to report per-epoch accuracy metrics.
# 3. Use :func:`nni.report_final_result` to report final accuracy.
#
# Please understand the model code before continuing to the next step.
# %%
# Step 2: Define search space
# ---------------------------
# In the model code, we have prepared 4 hyperparameters to be tuned:
# *dense_units*, *activation_type*, *dropout_rate*, and *learning_rate*.
#
# Here we need to define their *search space* so the tuning algorithm can sample them in the desired range.
#
# Assume we have the following prior knowledge for these hyperparameters:
#
# 1. *dense_units* should be one of 64, 128, 256.
# 2. *activation_type* should be one of 'relu', 'tanh', 'swish', or None.
# 3. *dropout_rate* should be a float between 0.5 and 0.9.
# 4. *learning_rate* should be a float between 0.0001 and 0.1, and it follows exponential distribution.
#
# In NNI, the space of *dense_units* and *activation_type* is called ``choice``;
# the space of *dropout_rate* is called ``uniform``;
# and the space of *learning_rate* is called ``loguniform``.
# You may have noticed that these names are derived from ``numpy.random``.
#
# For full specification of search space, check :doc:`the reference </hpo/search_space>`.
#
# Now we can define the search space as follows:
search_space = {
'dense_units': {'_type': 'choice', '_value': [64, 128, 256]},
'activation_type': {'_type': 'choice', '_value': ['relu', 'tanh', 'swish', None]},
'dropout_rate': {'_type': 'uniform', '_value': [0.5, 0.9]},
'learning_rate': {'_type': 'loguniform', '_value': [0.0001, 0.1]},
}
# %%
# Step 3: Configure the experiment
# --------------------------------
# NNI uses an *experiment* to manage the HPO process.
# The *experiment config* defines how to train the models and how to explore the search space.
#
# In this tutorial we use a *local* mode experiment,
# which means models will be trained on local machine, without using any special training platform.
from nni.experiment import Experiment
experiment = Experiment('local')
# %%
# Now we start to configure the experiment.
#
# Configure trial code
# ^^^^^^^^^^^^^^^^^^^^
# In NNI, the evaluation of each hyperparameter set is called a *trial*.
# So the model script is called *trial code*.
experiment.config.trial_command = 'python model.py'
experiment.config.trial_code_directory = '.'
# %%
# When ``trial_code_directory`` is a relative path, it is relative to the current working directory.
# To run ``main.py`` in a different path, you can set trial code directory to ``Path(__file__).parent``.
# (`__file__ <https://docs.python.org/3.10/reference/datamodel.html#index-43>`__
# is only available in standard Python, not in Jupyter Notebook.)
#
# .. attention::
#
# If you are using Linux system without Conda,
# you may need to change ``"python model.py"`` to ``"python3 model.py"``.
# %%
# Configure search space
# ^^^^^^^^^^^^^^^^^^^^^^
experiment.config.search_space = search_space
# %%
# Configure tuning algorithm
# ^^^^^^^^^^^^^^^^^^^^^^^^^^
# Here we use :doc:`TPE tuner </hpo/tuners>`.
experiment.config.tuner.name = 'TPE'
experiment.config.tuner.class_args['optimize_mode'] = 'maximize'
# %%
# Configure how many trials to run
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Here we evaluate 10 sets of hyperparameters in total, and concurrently evaluate 2 sets at a time.
experiment.config.max_trial_number = 10
experiment.config.trial_concurrency = 2
# %%
# You may also set ``max_experiment_duration = '1h'`` to limit running time.
#
# If neither ``max_trial_number`` nor ``max_experiment_duration`` are set,
# the experiment will run forever until you press Ctrl-C.
#
# .. note::
#
# ``max_trial_number`` is set to 10 here for a fast example.
# In the real world, it should be set to a larger number.
# With the default config, the TPE tuner requires 20 trials to warm up.
# %%
# Step 4: Run the experiment
# --------------------------
# Now the experiment is ready. Choose a port and launch it. (Here we use port 8080.)
#
# You can use the web portal to view experiment status: http://localhost:8080.
experiment.run(8080)
# %%
# After the experiment is done
# ----------------------------
# Everything is done and it is safe to exit now. The following are optional.
#
# If you are using standard Python instead of Jupyter Notebook,
# you can add ``input()`` or ``signal.pause()`` to prevent Python from exiting,
# allowing you to view the web portal after the experiment is done.
# input('Press enter to quit')
experiment.stop()
# %%
# :meth:`nni.experiment.Experiment.stop` is automatically invoked when Python exits,
# so it can be omitted in your code.
#
# After the experiment is stopped, you can run :meth:`nni.experiment.Experiment.view` to restart web portal.
#
# .. tip::
#
# This example uses :doc:`Python API </reference/experiment>` to create experiment.
#
# You can also create and manage experiments with :doc:`command line tool <../hpo_nnictl/nnictl>`.
"""
Port TensorFlow Quickstart to NNI
=================================
This is a modified version of `TensorFlow quickstart`_.
It can be run directly and will have the exact same result as the original version.
Furthermore, it enables auto-tuning with an NNI *experiment*, which will be detailed later.
It is recommended to run this script directly first to verify the environment.
There are 3 key differences from the original version:
1. In `Get optimized hyperparameters`_ part, it receives generated hyperparameters.
2. In `(Optional) Report intermediate results`_ part, it reports per-epoch accuracy metrics.
3. In `Report final result`_ part, it reports final accuracy.
.. _TensorFlow quickstart: https://www.tensorflow.org/tutorials/quickstart/beginner
"""
# %%
import nni
import tensorflow as tf
# %%
# Hyperparameters to be tuned
# ---------------------------
# These are the hyperparameters that will be tuned later.
params = {
'dense_units': 128,
'activation_type': 'relu',
'dropout_rate': 0.2,
'learning_rate': 0.001,
}
# %%
# Get optimized hyperparameters
# -----------------------------
# If run directly, :func:`nni.get_next_parameter` is a no-op and returns an empty dict.
# But with an NNI *experiment*, it will receive optimized hyperparameters from tuning algorithm.
optimized_params = nni.get_next_parameter()
params.update(optimized_params)
print(params)
# %%
# Load dataset
# ------------
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
# %%
# Build model with hyperparameters
# --------------------------------
model = tf.keras.models.Sequential([
tf.keras.layers.Flatten(input_shape=(28, 28)),
tf.keras.layers.Dense(params['dense_units'], activation=params['activation_type']),
tf.keras.layers.Dropout(params['dropout_rate']),
tf.keras.layers.Dense(10)
])
adam = tf.keras.optimizers.Adam(learning_rate=params['learning_rate'])
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam, loss=loss_fn, metrics=['accuracy'])
# %%
# (Optional) Report intermediate results
# --------------------------------------
# The callback reports per-epoch accuracy to show learning curve in the web portal.
# You can also leverage the metrics for early stopping with :doc:`NNI assessors </hpo/assessors>`.
#
# This part can be safely skipped and the experiment will work fine.
callback = tf.keras.callbacks.LambdaCallback(
on_epoch_end = lambda epoch, logs: nni.report_intermediate_result(logs['accuracy'])
)
# %%
# Train and evaluate the model
# ---------------------------
model.fit(x_train, y_train, epochs=5, verbose=2, callbacks=[callback])
loss, accuracy = model.evaluate(x_test, y_test, verbose=2)
# %%
# Report final result
# -------------------
# Report final accuracy to NNI so the tuning algorithm can suggest better hyperparameters.
nni.report_final_result(accuracy)
"""
Get started with NAS on MNIST
=============================
"""
# %%
a = (1, 2, 3)
a
# %%
print('hello')
"""
Use NAS Benchmarks as Datasets
==============================
In this tutorial, we show how to use NAS Benchmarks as datasets.
For research purposes we sometimes desire to query the benchmarks for architecture accuracies,
rather than train them one by one from scratch.
NNI provides query tools so that users can easily retrieve the data in NAS benchmarks.
"""
# %%
# Prerequisites
# -------------
# This tutorial assumes that you have already prepared your NAS benchmarks under cache directory
# (by default, ``~/.cache/nni/nasbenchmark``).
# If you haven't, please follow the data preparation guide in :doc:`/nas/benchmarks`.
#
# As a result, the directory should look like:
import os
os.listdir(os.path.expanduser('~/.cache/nni/nasbenchmark'))
# %%
import pprint
from nni.nas.benchmarks.nasbench101 import query_nb101_trial_stats
from nni.nas.benchmarks.nasbench201 import query_nb201_trial_stats
from nni.nas.benchmarks.nds import query_nds_trial_stats
# %%
# NAS-Bench-101
# -------------
#
# Use the following architecture as an example:
#
# .. image:: ../../img/nas-bench-101-example.png
arch = {
'op1': 'conv3x3-bn-relu',
'op2': 'maxpool3x3',
'op3': 'conv3x3-bn-relu',
'op4': 'conv3x3-bn-relu',
'op5': 'conv1x1-bn-relu',
'input1': [0],
'input2': [1],
'input3': [2],
'input4': [0],
'input5': [0, 3, 4],
'input6': [2, 5]
}
for t in query_nb101_trial_stats(arch, 108, include_intermediates=True):
pprint.pprint(t)
# %%
# An architecture of NAS-Bench-101 could be trained more than once.
# Each element of the returned generator is a dict which contains one of the training results of this trial config
# (architecture + hyper-parameters) including train/valid/test accuracy,
# training time, number of epochs, etc. The results of NAS-Bench-201 and NDS follow similar formats.
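#
# For example, to pick the best test accuracy among these repeated runs (a sketch; ``test_acc`` is
# assumed to be the field name shown in the printed results above, so check it against your output):
#
# .. code-block:: python
#
#    best = max(t['test_acc'] for t in query_nb101_trial_stats(arch, 108))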
#
# NAS-Bench-201
# -------------
#
# Use the following architecture as an example:
#
# .. image:: ../../img/nas-bench-201-example.png
arch = {
'0_1': 'avg_pool_3x3',
'0_2': 'conv_1x1',
'1_2': 'skip_connect',
'0_3': 'conv_1x1',
'1_3': 'skip_connect',
'2_3': 'skip_connect'
}
for t in query_nb201_trial_stats(arch, 200, 'cifar100'):
pprint.pprint(t)
# %%
# Intermediate results are also available.
for t in query_nb201_trial_stats(arch, None, 'imagenet16-120', include_intermediates=True):
print(t['config'])
print('Intermediates:', len(t['intermediates']))
# %%
# NDS
# ---
#
# Use the following architecture as an example:
#
# .. image:: ../../img/nas-bench-nds-example.png
#
# Here, ``bot_muls``, ``ds``, ``num_gs``, ``ss`` and ``ws`` stand for "bottleneck multipliers",
# "depths", "number of groups", "strides" and "widths" respectively.
# %%
model_spec = {
'bot_muls': [0.0, 0.25, 0.25, 0.25],
'ds': [1, 16, 1, 4],
'num_gs': [1, 2, 1, 2],
'ss': [1, 1, 2, 2],
'ws': [16, 64, 128, 16]
}
# %%
# Use none as a wildcard.
for t in query_nds_trial_stats('residual_bottleneck', None, None, model_spec, None, 'cifar10'):
pprint.pprint(t)
# %%
model_spec = {
'bot_muls': [0.0, 0.25, 0.25, 0.25],
'ds': [1, 16, 1, 4],
'num_gs': [1, 2, 1, 2],
'ss': [1, 1, 2, 2],
'ws': [16, 64, 128, 16]
}
for t in query_nds_trial_stats('residual_bottleneck', None, None, model_spec, None, 'cifar10', include_intermediates=True):
pprint.pprint(t['intermediates'][:10])
# %%
model_spec = {'ds': [1, 12, 12, 12], 'ss': [1, 1, 2, 2], 'ws': [16, 24, 24, 40]}
for t in query_nds_trial_stats('residual_basic', 'resnet', 'random', model_spec, {}, 'cifar10'):
pprint.pprint(t)
# %%
# Get the first one.
pprint.pprint(next(query_nds_trial_stats('vanilla', None, None, None, None, None)))
# %%
# Query a specific cell in the NDS ``nas_cell`` search space.
model_spec = {'num_nodes_normal': 5, 'num_nodes_reduce': 5, 'depth': 12, 'width': 32, 'aux': False, 'drop_prob': 0.0}
cell_spec = {
'normal_0_op_x': 'avg_pool_3x3',
'normal_0_input_x': 0,
'normal_0_op_y': 'conv_7x1_1x7',
'normal_0_input_y': 1,
'normal_1_op_x': 'sep_conv_3x3',
'normal_1_input_x': 2,
'normal_1_op_y': 'sep_conv_5x5',
'normal_1_input_y': 0,
'normal_2_op_x': 'dil_sep_conv_3x3',
'normal_2_input_x': 2,
'normal_2_op_y': 'dil_sep_conv_3x3',
'normal_2_input_y': 2,
'normal_3_op_x': 'skip_connect',
'normal_3_input_x': 4,
'normal_3_op_y': 'dil_sep_conv_3x3',
'normal_3_input_y': 4,
'normal_4_op_x': 'conv_7x1_1x7',
'normal_4_input_x': 2,
'normal_4_op_y': 'sep_conv_3x3',
'normal_4_input_y': 4,
'normal_concat': [3, 5, 6],
'reduce_0_op_x': 'avg_pool_3x3',
'reduce_0_input_x': 0,
'reduce_0_op_y': 'dil_sep_conv_3x3',
'reduce_0_input_y': 1,
'reduce_1_op_x': 'sep_conv_3x3',
'reduce_1_input_x': 0,
'reduce_1_op_y': 'sep_conv_3x3',
'reduce_1_input_y': 0,
'reduce_2_op_x': 'skip_connect',
'reduce_2_input_x': 2,
'reduce_2_op_y': 'sep_conv_7x7',
'reduce_2_input_y': 0,
'reduce_3_op_x': 'conv_7x1_1x7',
'reduce_3_input_x': 4,
'reduce_3_op_y': 'skip_connect',
'reduce_3_input_y': 4,
'reduce_4_op_x': 'conv_7x1_1x7',
'reduce_4_input_x': 0,
'reduce_4_op_y': 'conv_7x1_1x7',
'reduce_4_input_y': 5,
'reduce_concat': [3, 6]
}
for t in query_nds_trial_stats('nas_cell', None, None, model_spec, cell_spec, 'cifar10'):
assert t['config']['model_spec'] == model_spec
assert t['config']['cell_spec'] == cell_spec
pprint.pprint(t)
# %%
# Count number.
print('NDS (amoeba) count:', len(list(query_nds_trial_stats(None, 'amoeba', None, None, None, None, None))))