Merge pull request #5036 from microsoft/promote-retiarii-to-nas

[DO NOT SQUASH] Promote retiarii to NAS

Merge pull request #5036 from microsoft/promote-retiarii-to-nas
[DO NOT SQUASH] Promote retiarii to NAS
a0fd0036 · Yuge Zhang · GitHub · d6dcb483 · bc6d8796 · d6dcb483
Unverified Commit a0fd0036 authored Aug 01, 2022 by Yuge Zhang Committed by GitHub Aug 01, 2022
20 changed files
--- a/nni/algorithms/nas/pytorch/spos/__init__.py
+++ b/nni/algorithms/nas/pytorch/spos/__init__.py
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-from .evolution import SPOSEvolution
-from .mutator import SPOSSupernetTrainingMutator
-from .trainer import SPOSSupernetTrainer
--- a/nni/algorithms/nas/pytorch/spos/evolution.py
+++ b/nni/algorithms/nas/pytorch/spos/evolution.py
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import json
-import logging
-import os
-import re
-from collections import deque
-
-import numpy as np
-from nni.tuner import Tuner
-from nni.algorithms.nas.pytorch.classic_nas.mutator import LAYER_CHOICE, INPUT_CHOICE
-
-
-_logger = logging.getLogger(__name__)
-
-
-class SPOSEvolution(Tuner):
-    """
-    SPOS evolution tuner.
-
-    Parameters
-    ----------
-    max_epochs : int
-        Maximum number of epochs to run.
-    num_select : int
-        Number of survival candidates of each epoch.
-    num_population : int
-        Number of candidates at the start of each epoch. If candidates generated by
-        crossover and mutation are not enough, the rest will be filled with random
-        candidates.
-    m_prob : float
-        The probability of mutation.
-    num_crossover : int
-        Number of candidates generated by crossover in each epoch.
-    num_mutation : int
-        Number of candidates generated by mutation in each epoch.
-    """
-
-    def __init__(self, max_epochs=20, num_select=10, num_population=50, m_prob=0.1,
-                 num_crossover=25, num_mutation=25):
-        assert num_population >= num_select
-        self.max_epochs = max_epochs
-        self.num_select = num_select
-        self.num_population = num_population
-        self.m_prob = m_prob
-        self.num_crossover = num_crossover
-        self.num_mutation = num_mutation
-        self.epoch = 0
-        self.candidates = []
-        self.search_space = None
-        self.random_state = np.random.RandomState(0)
-
-        # async status
-        self._to_evaluate_queue = deque()
-        self._sending_parameter_queue = deque()
-        self._pending_result_ids = set()
-        self._reward_dict = dict()
-        self._id2candidate = dict()
-        self._st_callback = None
-
-    def update_search_space(self, search_space):
-        """
-        Handle the initialization/update event of search space.
-        """
-        self._search_space = search_space
-        self._next_round()
-
-    def _next_round(self):
-        _logger.info("Epoch %d, generating...", self.epoch)
-        if self.epoch == 0:
-            self._get_random_population()
-            self.export_results(self.candidates)
-        else:
-            best_candidates = self._select_top_candidates()
-            self.export_results(best_candidates)
-            if self.epoch >= self.max_epochs:
-                return
-            self.candidates = self._get_mutation(best_candidates) + self._get_crossover(best_candidates)
-            self._get_random_population()
-        self.epoch += 1
-
-    def _random_candidate(self):
-        chosen_arch = dict()
-        for key, val in self._search_space.items():
-            if val["_type"] == LAYER_CHOICE:
-                choices = val["_value"]
-                index = self.random_state.randint(len(choices))
-                chosen_arch[key] = {"_value": choices[index], "_idx": index}
-            elif val["_type"] == INPUT_CHOICE:
-                raise NotImplementedError("Input choice is not implemented yet.")
-        return chosen_arch
-
-    def _add_to_evaluate_queue(self, cand):
-        _logger.info("Generate candidate %s, adding to eval queue.", self._get_architecture_repr(cand))
-        self._reward_dict[self._hashcode(cand)] = 0.
-        self._to_evaluate_queue.append(cand)
-
-    def _get_random_population(self):
-        while len(self.candidates) < self.num_population:
-            cand = self._random_candidate()
-            if self._is_legal(cand):
-                _logger.info("Random candidate generated.")
-                self._add_to_evaluate_queue(cand)
-                self.candidates.append(cand)
-
-    def _get_crossover(self, best):
-        result = []
-        for _ in range(10 * self.num_crossover):
-            cand_p1 = best[self.random_state.randint(len(best))]
-            cand_p2 = best[self.random_state.randint(len(best))]
-            assert cand_p1.keys() == cand_p2.keys()
-            cand = {k: cand_p1[k] if self.random_state.randint(2) == 0 else cand_p2[k]
-                    for k in cand_p1.keys()}
-            if self._is_legal(cand):
-                result.append(cand)
-                self._add_to_evaluate_queue(cand)
-            if len(result) >= self.num_crossover:
-                break
-        _logger.info("Found %d architectures with crossover.", len(result))
-        return result
-
-    def _get_mutation(self, best):
-        result = []
-        for _ in range(10 * self.num_mutation):
-            cand = best[self.random_state.randint(len(best))].copy()
-            mutation_sample = np.random.random_sample(len(cand))
-            for s, k in zip(mutation_sample, cand):
-                if s < self.m_prob:
-                    choices = self._search_space[k]["_value"]
-                    index = self.random_state.randint(len(choices))
-                    cand[k] = {"_value": choices[index], "_idx": index}
-            if self._is_legal(cand):
-                result.append(cand)
-                self._add_to_evaluate_queue(cand)
-            if len(result) >= self.num_mutation:
-                break
-        _logger.info("Found %d architectures with mutation.", len(result))
-        return result
-
-    def _get_architecture_repr(self, cand):
-        return re.sub(r"\".*?\": \{\"_idx\": (\d+), \"_value\": \".*?\"\}", r"\1",
-                      self._hashcode(cand))
-
-    def _is_legal(self, cand):
-        if self._hashcode(cand) in self._reward_dict:
-            return False
-        return True
-
-    def _select_top_candidates(self):
-        reward_query = lambda cand: self._reward_dict[self._hashcode(cand)]
-        _logger.info("All candidate rewards: %s", list(map(reward_query, self.candidates)))
-        result = sorted(self.candidates, key=reward_query, reverse=True)[:self.num_select]
-        _logger.info("Best candidate rewards: %s", list(map(reward_query, result)))
-        return result
-
-    @staticmethod
-    def _hashcode(d):
-        return json.dumps(d, sort_keys=True)
-
-    def _bind_and_send_parameters(self):
-        """
-        There are two types of resources: parameter ids and candidates. This function is called at
-        necessary times to bind these resources to send new trials with st_callback.
-        """
-        result = []
-        while self._sending_parameter_queue and self._to_evaluate_queue:
-            parameter_id = self._sending_parameter_queue.popleft()
-            parameters = self._to_evaluate_queue.popleft()
-            self._id2candidate[parameter_id] = parameters
-            result.append(parameters)
-            self._pending_result_ids.add(parameter_id)
-            self._st_callback(parameter_id, parameters)
-            _logger.info("Send parameter [%d] %s.", parameter_id, self._get_architecture_repr(parameters))
-        return result
-
-    def generate_multiple_parameters(self, parameter_id_list, **kwargs):
-        """
-        Callback function necessary to implement a tuner. This will put more parameter ids into the
-        parameter id queue.
-        """
-        if "st_callback" in kwargs and self._st_callback is None:
-            self._st_callback = kwargs["st_callback"]
-        for parameter_id in parameter_id_list:
-            self._sending_parameter_queue.append(parameter_id)
-        self._bind_and_send_parameters()
-        return []  # always not use this. might induce problem of over-sending
-
-    def receive_trial_result(self, parameter_id, parameters, value, **kwargs):
-        """
-        Callback function. Receive a trial result.
-        """
-        _logger.info("Candidate %d, reported reward %f", parameter_id, value)
-        self._reward_dict[self._hashcode(self._id2candidate[parameter_id])] = value
-
-    def trial_end(self, parameter_id, success, **kwargs):
-        """
-        Callback function when a trial is ended and resource is released.
-        """
-        self._pending_result_ids.remove(parameter_id)
-        if not self._pending_result_ids and not self._to_evaluate_queue:
-            # a new epoch now
-            self._next_round()
-            assert self._st_callback is not None
-            self._bind_and_send_parameters()
-
-    def export_results(self, result):
-        """
-        Export a number of candidates to `checkpoints` dir.
-
-        Parameters
-        ----------
-        result : dict
-            Chosen architectures to be exported.
-        """
-        os.makedirs("checkpoints", exist_ok=True)
-        for i, cand in enumerate(result):
-            converted = dict()
-            for cand_key, cand_val in cand.items():
-                onehot = [k == cand_val["_idx"] for k in range(len(self._search_space[cand_key]["_value"]))]
-                converted[cand_key] = onehot
-            with open(os.path.join("checkpoints", "%03d_%03d.json" % (self.epoch, i)), "w") as fp:
-                json.dump(converted, fp)
--- a/nni/algorithms/nas/pytorch/spos/mutator.py
+++ b/nni/algorithms/nas/pytorch/spos/mutator.py
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import logging
-
-import numpy as np
-from nni.algorithms.nas.pytorch.random import RandomMutator
-
-_logger = logging.getLogger(__name__)
-
-
-class SPOSSupernetTrainingMutator(RandomMutator):
-    """
-    A random mutator with flops limit.
-
-    Parameters
-    ----------
-    model : nn.Module
-        PyTorch model.
-    flops_func : callable
-        Callable that takes a candidate from `sample_search` and returns its candidate. When `flops_func`
-        is None, functions related to flops will be deactivated.
-    flops_lb : number
-        Lower bound of flops.
-    flops_ub : number
-        Upper bound of flops.
-    flops_bin_num : number
-        Number of bins divided for the interval of flops to ensure the uniformity. Bigger number will be more
-        uniform, but the sampling will be slower.
-    flops_sample_timeout : int
-        Maximum number of attempts to sample before giving up and use a random candidate.
-    """
-    def __init__(self, model, flops_func=None, flops_lb=None, flops_ub=None,
-                 flops_bin_num=7, flops_sample_timeout=500):
-
-        super().__init__(model)
-        self._flops_func = flops_func
-        if self._flops_func is not None:
-            self._flops_bin_num = flops_bin_num
-            self._flops_bins = [flops_lb + (flops_ub - flops_lb) / flops_bin_num * i for i in range(flops_bin_num + 1)]
-            self._flops_sample_timeout = flops_sample_timeout
-
-    def sample_search(self):
-        """
-        Sample a candidate for training. When `flops_func` is not None, candidates will be sampled uniformly
-        relative to flops.
-
-        Returns
-        -------
-        dict
-        """
-        if self._flops_func is not None:
-            for times in range(self._flops_sample_timeout):
-                idx = np.random.randint(self._flops_bin_num)
-                cand = super().sample_search()
-                if self._flops_bins[idx] <= self._flops_func(cand) <= self._flops_bins[idx + 1]:
-                    _logger.debug("Sampled candidate flops %f in %d times.", cand, times)
-                    return cand
-            _logger.warning("Failed to sample a flops-valid candidate within %d tries.", self._flops_sample_timeout)
-        return super().sample_search()
-
-    def sample_final(self):
-        """
-        Implement only to suffice the interface of Mutator.
-        """
-        return self.sample_search()
--- a/nni/algorithms/nas/pytorch/spos/trainer.py
+++ b/nni/algorithms/nas/pytorch/spos/trainer.py
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import logging
-
-import torch
-from nni.nas.pytorch.trainer import Trainer
-from nni.nas.pytorch.utils import AverageMeterGroup
-
-from .mutator import SPOSSupernetTrainingMutator
-
-logger = logging.getLogger(__name__)
-
-
-class SPOSSupernetTrainer(Trainer):
-    """
-    This trainer trains a supernet that can be used for evolution search.
-
-    Parameters
-    ----------
-    model : nn.Module
-        Model with mutables.
-    mutator : nni.nas.pytorch.mutator.Mutator
-        A mutator object that has been initialized with the model.
-    loss : callable
-        Called with logits and targets. Returns a loss tensor.
-    metrics : callable
-        Returns a dict that maps metrics keys to metrics data.
-    optimizer : Optimizer
-        Optimizer that optimizes the model.
-    num_epochs : int
-        Number of epochs of training.
-    train_loader : iterable
-        Data loader of training. Raise ``StopIteration`` when one epoch is exhausted.
-    dataset_valid : iterable
-        Data loader of validation. Raise ``StopIteration`` when one epoch is exhausted.
-    batch_size : int
-        Batch size.
-    workers: int
-        Number of threads for data preprocessing. Not used for this trainer. Maybe removed in future.
-    device : torch.device
-        Device object. Either ``torch.device("cuda")`` or ``torch.device("cpu")``. When ``None``, trainer will
-        automatic detects GPU and selects GPU first.
-    log_frequency : int
-        Number of mini-batches to log metrics.
-    callbacks : list of Callback
-        Callbacks to plug into the trainer. See Callbacks.
-    """
-
-    def __init__(self, model, loss, metrics,
-                 optimizer, num_epochs, train_loader, valid_loader,
-                 mutator=None, batch_size=64, workers=4, device=None, log_frequency=None,
-                 callbacks=None):
-        assert torch.cuda.is_available()
-        super().__init__(model, mutator if mutator is not None else SPOSSupernetTrainingMutator(model),
-                         loss, metrics, optimizer, num_epochs, None, None,
-                         batch_size, workers, device, log_frequency, callbacks)
-
-        self.train_loader = train_loader
-        self.valid_loader = valid_loader
-
-    def train_one_epoch(self, epoch):
-        self.model.train()
-        meters = AverageMeterGroup()
-        for step, (x, y) in enumerate(self.train_loader):
-            x, y = x.to(self.device), y.to(self.device)
-            self.optimizer.zero_grad()
-            self.mutator.reset()
-            logits = self.model(x)
-            loss = self.loss(logits, y)
-            loss.backward()
-            self.optimizer.step()
-
-            metrics = self.metrics(logits, y)
-            metrics["loss"] = loss.item()
-            meters.update(metrics)
-            if self.log_frequency is not None and step % self.log_frequency == 0:
-                logger.info("Epoch [%s/%s] Step [%s/%s]  %s", epoch + 1,
-                            self.num_epochs, step + 1, len(self.train_loader), meters)
-
-    def validate_one_epoch(self, epoch):
-        self.model.eval()
-        meters = AverageMeterGroup()
-        with torch.no_grad():
-            for step, (x, y) in enumerate(self.valid_loader):
-                x, y = x.to(self.device), y.to(self.device)
-                self.mutator.reset()
-                logits = self.model(x)
-                loss = self.loss(logits, y)
-                metrics = self.metrics(logits, y)
-                metrics["loss"] = loss.item()
-                meters.update(metrics)
-                if self.log_frequency is not None and step % self.log_frequency == 0:
-                    logger.info("Epoch [%s/%s] Validation Step [%s/%s]  %s", epoch + 1,
-                                self.num_epochs, step + 1, len(self.valid_loader), meters)
--- a/nni/algorithms/nas/tensorflow/classic_nas/__init__.py
+++ b/nni/algorithms/nas/tensorflow/classic_nas/__init__.py
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-from .mutator import get_and_apply_next_architecture
--- a/nni/algorithms/nas/tensorflow/classic_nas/mutator.py
+++ b/nni/algorithms/nas/tensorflow/classic_nas/mutator.py
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-# pylint: skip-file
-
-import json
-import logging
-import os
-import sys
-
-import tensorflow as tf
-
-import nni
-from nni.runtime.env_vars import trial_env_vars
-from nni.nas.tensorflow.mutables import LayerChoice, InputChoice, MutableScope
-from nni.nas.tensorflow.mutator import Mutator
-
-logger = logging.getLogger(__name__)
-
-NNI_GEN_SEARCH_SPACE = "NNI_GEN_SEARCH_SPACE"
-LAYER_CHOICE = "layer_choice"
-INPUT_CHOICE = "input_choice"
-
-
-def get_and_apply_next_architecture(model):
-    """
-    Wrapper of :class:`~nni.nas.tensorflow.classic_nas.mutator.ClassicMutator` to make it more meaningful,
-    similar to ``get_next_parameter`` for HPO.
-    Tt will generate search space based on ``model``.
-    If env ``NNI_GEN_SEARCH_SPACE`` exists, this is in dry run mode for
-    generating search space for the experiment.
-    If not, there are still two mode, one is nni experiment mode where users
-    use ``nnictl`` to start an experiment. The other is standalone mode
-    where users directly run the trial command, this mode chooses the first
-    one(s) for each LayerChoice and InputChoice.
-    Parameters
-    ----------
-    model : nn.Module
-        User's model with search space (e.g., LayerChoice, InputChoice) embedded in it.
-    """
-    ClassicMutator(model)
-
-
-class ClassicMutator(Mutator):
-    """
-    This mutator is to apply the architecture chosen from tuner.
-    It implements the forward function of LayerChoice and InputChoice,
-    to only activate the chosen ones.
-    Parameters
-    ----------
-    model : nn.Module
-        User's model with search space (e.g., LayerChoice, InputChoice) embedded in it.
-    """
-
-    def __init__(self, model):
-        super(ClassicMutator, self).__init__(model)
-        self._chosen_arch = {}
-        self._search_space = self._generate_search_space()
-        if NNI_GEN_SEARCH_SPACE in os.environ:
-            # dry run for only generating search space
-            self._dump_search_space(os.environ[NNI_GEN_SEARCH_SPACE])
-            sys.exit(0)
-
-        if trial_env_vars.NNI_PLATFORM is None:
-            logger.warning("This is in standalone mode, the chosen are the first one(s).")
-            self._chosen_arch = self._standalone_generate_chosen()
-        else:
-            # get chosen arch from tuner
-            self._chosen_arch = nni.get_next_parameter()
-            if self._chosen_arch is None:
-                if trial_env_vars.NNI_PLATFORM == "unittest":
-                    # happens if NNI_PLATFORM is intentionally set, e.g., in UT
-                    logger.warning("`NNI_PLATFORM` is set but `param` is None. Falling back to standalone mode.")
-                    self._chosen_arch = self._standalone_generate_chosen()
-                else:
-                    raise RuntimeError("Chosen architecture is None. This may be a platform error.")
-        self.reset()
-
-    def _sample_layer_choice(self, mutable, idx, value, search_space_item):
-        """
-        Convert layer choice to tensor representation.
-        Parameters
-        ----------
-        mutable : Mutable
-        idx : int
-            Number `idx` of list will be selected.
-        value : str
-            The verbose representation of the selected value.
-        search_space_item : list
-            The list for corresponding search space.
-        """
-        # doesn't support multihot for layer choice yet
-        assert 0 <= idx < len(mutable) and search_space_item[idx] == value, \
-            "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_item, value)
-        mask = tf.one_hot(idx, len(mutable))
-        return tf.cast(tf.reshape(mask, [-1]), tf.bool)
-
-    def _sample_input_choice(self, mutable, idx, value, search_space_item):
-        """
-        Convert input choice to tensor representation.
-        Parameters
-        ----------
-        mutable : Mutable
-        idx : int
-            Number `idx` of list will be selected.
-        value : str
-            The verbose representation of the selected value.
-        search_space_item : list
-            The list for corresponding search space.
-        """
-        candidate_repr = search_space_item["candidates"]
-        multihot_list = [False] * mutable.n_candidates
-        for i, v in zip(idx, value):
-            assert 0 <= i < mutable.n_candidates and candidate_repr[i] == v, \
-                "Index '{}' in search space '{}' is not '{}'".format(i, candidate_repr, v)
-            assert not multihot_list[i], "'{}' is selected twice in '{}', which is not allowed.".format(i, idx)
-            multihot_list[i] = True
-        return tf.cast(multihot_list, tf.bool)  # pylint: disable=not-callable
-
-    def sample_search(self):
-        """
-        See :meth:`sample_final`.
-        """
-        return self.sample_final()
-
-    def sample_final(self):
-        """
-        Convert the chosen arch and apply it on model.
-        """
-        assert set(self._chosen_arch.keys()) == set(self._search_space.keys()), \
-            "Unmatched keys, expected keys '{}' from search space, found '{}'.".format(self._search_space.keys(),
-                                                                                       self._chosen_arch.keys())
-        result = dict()
-        for mutable in self.mutables:
-            if isinstance(mutable, (LayerChoice, InputChoice)):
-                assert mutable.key in self._chosen_arch, \
-                    "Expected '{}' in chosen arch, but not found.".format(mutable.key)
-                data = self._chosen_arch[mutable.key]
-                assert isinstance(data, dict) and "_value" in data and "_idx" in data, \
-                    "'{}' is not a valid choice.".format(data)
-            if isinstance(mutable, LayerChoice):
-                result[mutable.key] = self._sample_layer_choice(mutable, data["_idx"], data["_value"],
-                                                                self._search_space[mutable.key]["_value"])
-            elif isinstance(mutable, InputChoice):
-                result[mutable.key] = self._sample_input_choice(mutable, data["_idx"], data["_value"],
-                                                                self._search_space[mutable.key]["_value"])
-            elif isinstance(mutable, MutableScope):
-                logger.info("Mutable scope '%s' is skipped during parsing choices.", mutable.key)
-            else:
-                raise TypeError("Unsupported mutable type: '%s'." % type(mutable))
-        return result
-
-    def _standalone_generate_chosen(self):
-        """
-        Generate the chosen architecture for standalone mode,
-        i.e., choose the first one(s) for LayerChoice and InputChoice.
-        ::
-            { key_name: {"_value": "conv1",
-                         "_idx": 0} }
-            { key_name: {"_value": ["in1"],
-                         "_idx": [0]} }
-        Returns
-        -------
-        dict
-            the chosen architecture
-        """
-        chosen_arch = {}
-        for key, val in self._search_space.items():
-            if val["_type"] == LAYER_CHOICE:
-                choices = val["_value"]
-                chosen_arch[key] = {"_value": choices[0], "_idx": 0}
-            elif val["_type"] == INPUT_CHOICE:
-                choices = val["_value"]["candidates"]
-                n_chosen = val["_value"]["n_chosen"]
-                if n_chosen is None:
-                    n_chosen = len(choices)
-                chosen_arch[key] = {"_value": choices[:n_chosen], "_idx": list(range(n_chosen))}
-            else:
-                raise ValueError("Unknown key '%s' and value '%s'." % (key, val))
-        return chosen_arch
-
-    def _generate_search_space(self):
-        """
-        Generate search space from mutables.
-        Here is the search space format:
-        ::
-            { key_name: {"_type": "layer_choice",
-                         "_value": ["conv1", "conv2"]} }
-            { key_name: {"_type": "input_choice",
-                         "_value": {"candidates": ["in1", "in2"],
-                                    "n_chosen": 1}} }
-        Returns
-        -------
-        dict
-            the generated search space
-        """
-        search_space = {}
-        for mutable in self.mutables:
-            # for now we only generate flattened search space
-            if isinstance(mutable, LayerChoice):
-                key = mutable.key
-                val = mutable.names
-                search_space[key] = {"_type": LAYER_CHOICE, "_value": val}
-            elif isinstance(mutable, InputChoice):
-                key = mutable.key
-                search_space[key] = {"_type": INPUT_CHOICE,
-                                     "_value": {"candidates": mutable.choose_from,
-                                                "n_chosen": mutable.n_chosen}}
-            elif isinstance(mutable, MutableScope):
-                logger.info("Mutable scope '%s' is skipped during generating search space.", mutable.key)
-            else:
-                raise TypeError("Unsupported mutable type: '%s'." % type(mutable))
-        return search_space
-
-    def _dump_search_space(self, file_path):
-        with open(file_path, "w") as ss_file:
-            json.dump(self._search_space, ss_file, sort_keys=True, indent=2)
--- a/nni/algorithms/nas/tensorflow/enas/__init__.py
+++ b/nni/algorithms/nas/tensorflow/enas/__init__.py
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-from .mutator import EnasMutator
-from .trainer import EnasTrainer
--- a/nni/algorithms/nas/tensorflow/enas/mutator.py
+++ b/nni/algorithms/nas/tensorflow/enas/mutator.py
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-# pylint: skip-file
-
-import tensorflow as tf
-from tensorflow.keras.layers import Dense, Embedding, LSTMCell, RNN
-from tensorflow.keras.losses import SparseCategoricalCrossentropy, Reduction
-
-from nni.nas.tensorflow.mutator import Mutator
-from nni.nas.tensorflow.mutables import LayerChoice, InputChoice, MutableScope
-
-
-class EnasMutator(Mutator):
-    def __init__(self, model,
-                 lstm_size=64,
-                 lstm_num_layers=1,
-                 tanh_constant=1.5,
-                 cell_exit_extra_step=False,
-                 skip_target=0.4,
-                 temperature=None,
-                 branch_bias=0.25,
-                 entropy_reduction='sum'):
-        super().__init__(model)
-        self.tanh_constant = tanh_constant
-        self.temperature = temperature
-        self.cell_exit_extra_step = cell_exit_extra_step
-
-        cells = [LSTMCell(units=lstm_size, use_bias=False) for _ in range(lstm_num_layers)]
-        self.lstm = RNN(cells, stateful=True)
-        self.g_emb = tf.random.normal((1, 1, lstm_size)) * 0.1
-        self.skip_targets = tf.constant([1.0 - skip_target, skip_target])
-
-        self.max_layer_choice = 0
-        self.bias_dict = {}
-        for mutable in self.mutables:
-            if isinstance(mutable, LayerChoice):
-                if self.max_layer_choice == 0:
-                    self.max_layer_choice = len(mutable)
-                assert self.max_layer_choice == len(mutable), \
-                        "ENAS mutator requires all layer choice have the same number of candidates."
-                if 'reduce' in mutable.key:
-                    bias = []
-                    for choice in mutable.choices:
-                        if 'conv' in str(type(choice)).lower():
-                            bias.append(branch_bias)
-                        else:
-                            bias.append(-branch_bias)
-                    self.bias_dict[mutable.key] = tf.constant(bias)
-
-        # exposed for trainer
-        self.sample_log_prob = 0
-        self.sample_entropy = 0
-        self.sample_skip_penalty = 0
-
-        # internal nn layers
-        self.embedding = Embedding(self.max_layer_choice + 1, lstm_size)
-        self.soft = Dense(self.max_layer_choice, use_bias=False)
-        self.attn_anchor = Dense(lstm_size, use_bias=False)
-        self.attn_query = Dense(lstm_size, use_bias=False)
-        self.v_attn = Dense(1, use_bias=False)
-        assert entropy_reduction in ['sum', 'mean'], 'Entropy reduction must be one of sum and mean.'
-        self.entropy_reduction = tf.reduce_sum if entropy_reduction == 'sum' else tf.reduce_mean
-        self.cross_entropy_loss = SparseCategoricalCrossentropy(from_logits=True, reduction=Reduction.NONE)
-
-        self._first_sample = True
-
-    def sample_search(self):
-        self._initialize()
-        self._sample(self.mutables)
-        self._first_sample = False
-        return self._choices
-
-    def sample_final(self):
-        return self.sample_search()
-
-    def _sample(self, tree):
-        mutable = tree.mutable
-        if isinstance(mutable, LayerChoice) and mutable.key not in self._choices:
-            self._choices[mutable.key] = self._sample_layer_choice(mutable)
-        elif isinstance(mutable, InputChoice) and mutable.key not in self._choices:
-            self._choices[mutable.key] = self._sample_input_choice(mutable)
-        for child in tree.children:
-            self._sample(child)
-        if self.cell_exit_extra_step and isinstance(mutable, MutableScope) and mutable.key not in self._anchors_hid:
-            self._anchors_hid[mutable.key] = self.lstm(self._inputs, 1)
-
-    def _initialize(self):
-        self._choices = {}
-        self._anchors_hid = {}
-        self._inputs = self.g_emb
-        # seems the `input_shape` parameter of RNN does not work
-        # workaround it by omitting `reset_states` for first run
-        if not self._first_sample:
-            self.lstm.reset_states()
-        self.sample_log_prob = 0
-        self.sample_entropy = 0
-        self.sample_skip_penalty = 0
-
-    def _sample_layer_choice(self, mutable):
-        logit = self.soft(self.lstm(self._inputs))
-        if self.temperature is not None:
-            logit /= self.temperature
-        if self.tanh_constant is not None:
-            logit = self.tanh_constant * tf.tanh(logit)
-        if mutable.key in self.bias_dict:
-            logit += self.bias_dict[mutable.key]
-        softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1))
-        branch_id = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [1])
-        log_prob = self.cross_entropy_loss(branch_id, logit)
-        self.sample_log_prob += self.entropy_reduction(log_prob)
-        entropy = log_prob * tf.math.exp(-log_prob)
-        self.sample_entropy += self.entropy_reduction(entropy)
-        self._inputs = tf.reshape(self.embedding(branch_id), [1, 1, -1])
-        mask = tf.one_hot(branch_id, self.max_layer_choice)
-        return tf.cast(tf.reshape(mask, [-1]), tf.bool)
-
-    def _sample_input_choice(self, mutable):
-        query, anchors = [], []
-        for label in mutable.choose_from:
-            if label not in self._anchors_hid:
-                self._anchors_hid[label] = self.lstm(self._inputs)
-            query.append(self.attn_anchor(self._anchors_hid[label]))
-            anchors.append(self._anchors_hid[label])
-        query = tf.concat(query, axis=0)
-        query = tf.tanh(query + self.attn_query(anchors[-1]))
-        query = self.v_attn(query)
-
-        if self.temperature is not None:
-            query /= self.temperature
-        if self.tanh_constant is not None:
-            query = self.tanh_constant * tf.tanh(query)
-
-        if mutable.n_chosen is None:
-            logit = tf.concat([-query, query], axis=1)
-            softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1))
-            skip = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [-1])
-            skip_prob = tf.math.sigmoid(logit)
-            kl = tf.reduce_sum(skip_prob * tf.math.log(skip_prob / self.skip_targets))
-            self.sample_skip_penalty += kl
-            log_prob = self.cross_entropy_loss(skip, logit)
-
-            skip = tf.cast(skip, tf.float32)
-            inputs = tf.tensordot(skip, tf.concat(anchors, 0), 1) / (1. + tf.reduce_sum(skip))
-            self._inputs = tf.reshape(inputs, [1, 1, -1])
-
-        else:
-            assert mutable.n_chosen == 1, "Input choice must select exactly one or any in ENAS."
-            logit = tf.reshape(query, [1, -1])
-            softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1))
-            index = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [-1])
-            skip = tf.reshape(tf.one_hot(index, mutable.n_candidates), [-1])
-            # when the size is 1, tf does not accept tensor here, complaining the shape is wrong
-            # but using a numpy array seems fine
-            log_prob = self.cross_entropy_loss(logit, query.numpy())
-            self._inputs = tf.reshape(anchors[index.numpy()[0]], [1, 1, -1])
-
-        self.sample_log_prob += self.entropy_reduction(log_prob)
-        entropy = log_prob * tf.exp(-log_prob)
-        self.sample_entropy += self.entropy_reduction(entropy)
-        assert len(skip) == mutable.n_candidates, (skip, mutable.n_candidates, mutable.n_chosen)
-        return tf.cast(skip, tf.bool)
--- a/nni/algorithms/nas/tensorflow/enas/trainer.py
+++ b/nni/algorithms/nas/tensorflow/enas/trainer.py
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-# pylint: skip-file
-
-import logging
-
-import tensorflow as tf
-from tensorflow.keras.optimizers import Adam
-
-from nni.nas.tensorflow.utils import AverageMeterGroup, fill_zero_grads
-
-from .mutator import EnasMutator
-
-logger = logging.getLogger(__name__)
-
-
-class EnasTrainer:
-    def __init__(
-        self,
-        model,
-        loss,
-        metrics,
-        reward_function,
-        optimizer,
-        batch_size,
-        num_epochs,
-        dataset_train,
-        dataset_valid,
-        log_frequency=100,
-        entropy_weight=0.0001,
-        skip_weight=0.8,
-        baseline_decay=0.999,
-        child_steps=500,
-        mutator_lr=0.00035,
-        mutator_steps=50,
-        mutator_steps_aggregate=20,
-        aux_weight=0.4,
-        test_arc_per_epoch=1,
-    ):
-        self.model = model
-        self.loss = loss
-        self.metrics = metrics
-        self.reward_function = reward_function
-        self.optimizer = optimizer
-        self.batch_size = batch_size
-        self.num_epochs = num_epochs
-
-        x, y = dataset_train
-        split = int(len(x) * 0.9)
-        self.train_set = tf.data.Dataset.from_tensor_slices((x[:split], y[:split]))
-        self.valid_set = tf.data.Dataset.from_tensor_slices((x[split:], y[split:]))
-        self.test_set = tf.data.Dataset.from_tensor_slices(dataset_valid)
-
-        self.log_frequency = log_frequency
-        self.entropy_weight = entropy_weight
-        self.skip_weight = skip_weight
-        self.baseline_decay = baseline_decay
-        self.child_steps = child_steps
-        self.mutator_lr = mutator_lr
-        self.mutator_steps = mutator_steps
-        self.mutator_steps_aggregate = mutator_steps_aggregate
-        self.aux_weight = aux_weight
-        self.test_arc_per_epoch = test_arc_per_epoch
-
-        self.mutator = EnasMutator(model)
-        self.mutator_optim = Adam(learning_rate=self.mutator_lr)
-
-        self.baseline = 0.0
-
-    def train(self, validate=True):
-        for epoch in range(self.num_epochs):
-            logger.info("Epoch %d Training", epoch + 1)
-            self.train_one_epoch(epoch)
-            logger.info("Epoch %d Validating", epoch + 1)
-            self.validate_one_epoch(epoch)
-
-    def validate(self):
-        self.validate_one_epoch(-1)
-
-    def train_one_epoch(self, epoch):
-        train_loader, valid_loader = self._create_train_loader()
-
-        # Sample model and train
-        meters = AverageMeterGroup()
-
-        for step in range(1, self.child_steps + 1):
-            x, y = next(train_loader)
-            self.mutator.reset()
-
-            with tf.GradientTape() as tape:
-                logits = self.model(x, training=True)
-                if isinstance(logits, tuple):
-                    logits, aux_logits = logits
-                    aux_loss = self.loss(aux_logits, y)
-                else:
-                    aux_loss = 0.0
-                metrics = self.metrics(y, logits)
-                loss = self.loss(y, logits) + self.aux_weight * aux_loss
-
-            grads = tape.gradient(loss, self.model.trainable_weights)
-            grads = fill_zero_grads(grads, self.model.trainable_weights)
-            grads, _ = tf.clip_by_global_norm(grads, 5.0)
-            self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
-
-            metrics["loss"] = tf.reduce_mean(loss).numpy()
-            meters.update(metrics)
-
-            if self.log_frequency and step % self.log_frequency == 0:
-                logger.info(
-                    "Model Epoch [%d/%d] Step [%d/%d]  %s",
-                    epoch + 1,
-                    self.num_epochs,
-                    step,
-                    self.child_steps,
-                    meters,
-                )
-
-        # Train sampler (mutator)
-        meters = AverageMeterGroup()
-        for mutator_step in range(1, self.mutator_steps + 1):
-            grads_list = []
-            for step in range(1, self.mutator_steps_aggregate + 1):
-                with tf.GradientTape() as tape:
-                    x, y = next(valid_loader)
-                    self.mutator.reset()
-
-                    logits = self.model(x, training=False)
-                    metrics = self.metrics(y, logits)
-                    reward = (
-                        self.reward_function(y, logits)
-                        + self.entropy_weight * self.mutator.sample_entropy
-                    )
-                    self.baseline = self.baseline * self.baseline_decay + reward * (
-                        1 - self.baseline_decay
-                    )
-                    loss = self.mutator.sample_log_prob * (reward - self.baseline)
-                    loss += self.skip_weight * self.mutator.sample_skip_penalty
-
-                    meters.update(
-                        {
-                            "reward": reward,
-                            "loss": tf.reduce_mean(loss).numpy(),
-                            "ent": self.mutator.sample_entropy.numpy(),
-                            "log_prob": self.mutator.sample_log_prob.numpy(),
-                            "baseline": self.baseline,
-                            "skip": self.mutator.sample_skip_penalty,
-                        }
-                    )
-
-                    cur_step = step + (mutator_step - 1) * self.mutator_steps_aggregate
-                    if self.log_frequency and cur_step % self.log_frequency == 0:
-                        logger.info(
-                            "RL Epoch [%d/%d] Step [%d/%d] [%d/%d]  %s",
-                            epoch + 1,
-                            self.num_epochs,
-                            mutator_step,
-                            self.mutator_steps,
-                            step,
-                            self.mutator_steps_aggregate,
-                            meters,
-                        )
-
-                grads = tape.gradient(loss, self.mutator.trainable_weights)
-                grads = fill_zero_grads(grads, self.mutator.trainable_weights)
-                grads_list.append(grads)
-            total_grads = [
-                tf.math.add_n(weight_grads) for weight_grads in zip(*grads_list)
-            ]
-            total_grads, _ = tf.clip_by_global_norm(total_grads, 5.0)
-            self.mutator_optim.apply_gradients(
-                zip(total_grads, self.mutator.trainable_weights)
-            )
-
-    def validate_one_epoch(self, epoch):
-        test_loader = self._create_validate_loader()
-
-        for arc_id in range(self.test_arc_per_epoch):
-            meters = AverageMeterGroup()
-            for x, y in test_loader:
-                self.mutator.reset()
-                logits = self.model(x, training=False)
-                if isinstance(logits, tuple):
-                    logits, _ = logits
-                metrics = self.metrics(y, logits)
-                loss = self.loss(y, logits)
-                metrics["loss"] = tf.reduce_mean(loss).numpy()
-                meters.update(metrics)
-
-            logger.info(
-                "Test Epoch [%d/%d] Arc [%d/%d] Summary  %s",
-                epoch + 1,
-                self.num_epochs,
-                arc_id + 1,
-                self.test_arc_per_epoch,
-                meters.summary(),
-            )
-
-    def _create_train_loader(self):
-        train_set = self.train_set.shuffle(1000000).repeat().batch(self.batch_size)
-        test_set = self.valid_set.shuffle(1000000).repeat().batch(self.batch_size)
-        return iter(train_set), iter(test_set)
-
-    def _create_validate_loader(self):
-        return iter(self.test_set.shuffle(1000000).batch(self.batch_size))
--- a/nni/nas/__init__.py
+++ b/nni/nas/__init__.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from .execution import *
+from .fixed import fixed_arch
+from .mutable import *
+from .utils import *
--- a/nni/nas/evaluator/__init__.py
+++ b/nni/nas/evaluator/__init__.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from nni.common.framework import shortcut_framework
+
+from .evaluator import Evaluator
+from .functional import FunctionalEvaluator
+
+shortcut_framework(__name__)
+
+del shortcut_framework
--- a/nni/nas/evaluator/evaluator.py
+++ b/nni/nas/evaluator/evaluator.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from __future__ import annotations
+
+__all__ = ['Evaluator']
+
+import abc
+from typing import Any, Callable, Type, Union, cast
+
+
+class Evaluator(abc.ABC):
+    """
+    Evaluator of a model. An evaluator should define where the training code is, and the configuration of
+    training code. The configuration includes basic runtime information trainer needs to know (such as number of GPUs)
+    or tune-able parameters (such as learning rate), depending on the implementation of training code.
+
+    Each config should define how it is interpreted in ``_execute()``, taking only one argument which is the mutated model class.
+    For example, functional evaluator might directly import the function and call the function.
+    """
+
+    def evaluate(self, model_cls: Union[Callable[[], Any], Any]) -> Any:
+        """To run evaluation of a model. The model could be either a concrete model or a callable returning a model.
+
+        The concrete implementation of evaluate depends on the implementation of ``_execute()`` in sub-class.
+        """
+        return self._execute(model_cls)
+
+    def __repr__(self):
+        items = ', '.join(['%s=%r' % (k, v) for k, v in self.__dict__.items()])
+        return f'{self.__class__.__name__}({items})'
+
+    @staticmethod
+    def _load(ir: Any) -> 'Evaluator':
+        evaluator_type = ir.get('type')
+        if isinstance(evaluator_type, str):
+            # for debug purposes only
+            for subclass in Evaluator.__subclasses__():
+                if subclass.__name__ == evaluator_type:
+                    evaluator_type = subclass
+                    break
+        assert issubclass(cast(type, evaluator_type), Evaluator)
+        return cast(Type[Evaluator], evaluator_type)._load(ir)
+
+    @abc.abstractmethod
+    def _dump(self) -> Any:
+        """
+        Subclass implements ``_dump`` for their own serialization.
+        They should return a dict, with a key ``type`` which equals ``self.__class__``,
+        and optionally other keys.
+        """
+        pass
+
+    @abc.abstractmethod
+    def _execute(self, model_cls: Union[Callable[[], Any], Any]) -> Any:
+        pass
+
+    @abc.abstractmethod
+    def __eq__(self, other) -> bool:
+        pass
--- a/nni/nas/evaluator/functional.py
+++ b/nni/nas/evaluator/functional.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import nni
+from .evaluator import Evaluator
+
+
+@nni.trace
+class FunctionalEvaluator(Evaluator):
+    """
+    Functional evaluator that directly takes a function and thus should be general.
+
+    Attributes
+    ----------
+    function
+        The full name of the function.
+    arguments
+        Keyword arguments for the function other than model.
+    """
+
+    def __init__(self, function, **kwargs):
+        self.function = function
+        self.arguments = kwargs
+
+    @staticmethod
+    def _load(ir):
+        return FunctionalEvaluator(ir['function'], **ir['arguments'])
+
+    def _dump(self):
+        return {
+            'type': self.__class__,
+            'function': self.function,
+            'arguments': self.arguments
+        }
+
+    def _execute(self, model_cls):
+        return self.function(model_cls, **self.arguments)
+
+    def __eq__(self, other):
+        return self.function == other.function and self.arguments == other.arguments
--- a/nni/algorithms/nas/pytorch/random/__init__.py
+++ b/nni/algorithms/nas/pytorch/random/__init__.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-from .mutator import RandomMutator
+from .lightning import *
--- a/nni/algorithms/nas/__init__.py
+++ b/nni/algorithms/nas/__init__.py
--- a/nni/nas/evaluator/pytorch/cgo/evaluator.py
+++ b/nni/nas/evaluator/pytorch/cgo/evaluator.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import warnings
+from typing import Dict, List, Optional, Union, Type
+
+
+import torch.nn as nn
+import torch.optim as optim
+import torchmetrics
+from torch.utils.data import DataLoader
+
+import nni
+
+from ..lightning import LightningModule, _AccuracyWithLogits, Lightning
+from .trainer import Trainer
+
+__all__ = [
+    '_MultiModelSupervisedLearningModule', 'MultiModelSupervisedLearningModule',
+    '_ClassificationModule', 'Classification',
+    '_RegressionModule', 'Regression',
+]
+
+
+@nni.trace
+class _MultiModelSupervisedLearningModule(LightningModule):
+    def __init__(self, criterion: Type[nn.Module], metrics: Dict[str, torchmetrics.Metric],
+                 n_models: int = 0,
+                 learning_rate: float = 0.001,
+                 weight_decay: float = 0.,
+                 optimizer: optim.Optimizer = optim.Adam):
+        super().__init__()
+        self.save_hyperparameters('criterion', 'optimizer', 'learning_rate', 'weight_decay')
+        self.criterion = criterion()
+        self.criterion_cls = criterion
+        self.optimizer = optimizer
+        self.metrics = nn.ModuleDict({name: cls() for name, cls in metrics.items()})
+        self.metrics_args = metrics
+        self.n_models = n_models
+
+    def dump_kwargs(self):
+        kwargs = {}
+        kwargs['criterion'] = self.criterion_cls
+        kwargs['metrics'] = self.metrics_args
+        kwargs['n_models'] = self.n_models
+        kwargs['learning_rate'] = self.hparams['learning_rate']
+        kwargs['weight_decay'] = self.hparams['weight_decay']
+        kwargs['optimizer'] = self.optimizer
+        return kwargs
+
+
+    def forward(self, x):
+        y_hat = self.model(x)
+        return y_hat
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        multi_y_hat = self(x)
+        if isinstance(multi_y_hat, tuple):
+            assert len(multi_y_hat) == self.n_models
+        else:
+            assert self.n_models == 1
+            multi_y_hat = [multi_y_hat]
+        multi_loss = []
+        for idx, y_hat in enumerate(multi_y_hat):
+            loss = self.criterion(y_hat.to("cpu"), y.to("cpu"))
+            self.log(f'train_loss_{idx}', loss, prog_bar=True)
+            for name, metric in self.metrics.items():
+                self.log(f'train_{idx}_' + name, metric(y_hat.to("cpu"), y.to("cpu")), prog_bar=True)
+            multi_loss.append(loss)
+        return sum(multi_loss)
+
+    def validation_step(self, batch, batch_idx):
+        x, y = batch
+        multi_y_hat = self(x)
+        if isinstance(multi_y_hat, tuple):
+            assert len(multi_y_hat) == self.n_models
+        else:
+            assert self.n_models == 1
+            multi_y_hat = [multi_y_hat]
+        for idx, y_hat in enumerate(multi_y_hat):
+            self.log(f'val_loss_{idx}', self.criterion(y_hat.to("cpu"), y.to("cpu")), prog_bar=True)
+            for name, metric in self.metrics.items():
+                self.log(f'val_{idx}_' + name, metric(y_hat.to("cpu"), y.to("cpu")), prog_bar=True)
+
+    def test_step(self, batch, batch_idx):
+        x, y = batch
+        multi_y_hat = self(x)
+        if isinstance(multi_y_hat, tuple):
+            assert len(multi_y_hat) == self.n_models
+        else:
+            assert self.n_models == 1
+            multi_y_hat = [multi_y_hat]
+        for idx, y_hat in enumerate(multi_y_hat):
+            self.log(f'test_loss_{idx}', self.criterion(y_hat.to("cpu"), y.to("cpu")), prog_bar=True)
+            for name, metric in self.metrics.items():
+                self.log(f'test_{idx}_' + name, metric(y_hat.to("cpu"), y.to("cpu")), prog_bar=True)
+
+    def configure_optimizers(self):
+        return self.optimizer(self.parameters(), lr=self.hparams.learning_rate, weight_decay=self.hparams.weight_decay)
+
+    def on_validation_epoch_end(self):
+        nni.report_intermediate_result(self._get_validation_metrics())
+
+    def teardown(self, stage):
+        if stage == 'fit':
+            nni.report_final_result(self._get_validation_metrics())
+
+    def _get_validation_metrics(self):
+        # TODO: split metric of multiple models?
+        if len(self.metrics) == 1:
+            metric_name = next(iter(self.metrics))
+            ret = []
+            for idx in range(self.n_models):
+                ret.append(self.trainer.callback_metrics[f'val_{idx}_' + metric_name].item())
+            return ret
+        else:
+            warnings.warn('Multiple metrics without "default" is not supported by current framework.')
+            return {name: self.trainer.callback_metrics['val_' + name].item() for name in self.metrics}
+
+
+class MultiModelSupervisedLearningModule(_MultiModelSupervisedLearningModule):
+    """
+    Lightning Module of SupervisedLearning for Cross-Graph Optimization.
+    Users who needs cross-graph optimization should use this module.
+
+    Parameters
+    ----------
+    criterion : nn.Module
+        Class for criterion module (not an instance). default: ``nn.CrossEntropyLoss``
+    learning_rate : float
+        Learning rate. default: 0.001
+    weight_decay : float
+        L2 weight decay. default: 0
+    optimizer : Optimizer
+        Class for optimizer (not an instance). default: ``Adam``
+    """
+
+    def __init__(self, criterion: nn.Module, metrics: Dict[str, torchmetrics.Metric],
+                 learning_rate: float = 0.001,
+                 weight_decay: float = 0.,
+                 optimizer: optim.Optimizer = optim.Adam):
+        super().__init__(criterion, metrics, learning_rate=learning_rate, weight_decay=weight_decay, optimizer=optimizer)
+
+
+class _ClassificationModule(_MultiModelSupervisedLearningModule):
+    def __init__(self, criterion: nn.Module = nn.CrossEntropyLoss,
+                 learning_rate: float = 0.001,
+                 weight_decay: float = 0.,
+                 optimizer: optim.Optimizer = optim.Adam):
+        super().__init__(criterion, {'acc': _AccuracyWithLogits},
+                         learning_rate=learning_rate, weight_decay=weight_decay, optimizer=optimizer)
+
+
+class Classification(Lightning):
+    """
+    Trainer that is used for classification.
+
+    Parameters
+    ----------
+    criterion : nn.Module
+        Class for criterion module (not an instance). default: ``nn.CrossEntropyLoss``
+    learning_rate : float
+        Learning rate. default: 0.001
+    weight_decay : float
+        L2 weight decay. default: 0
+    optimizer : Optimizer
+        Class for optimizer (not an instance). default: ``Adam``
+    train_dataloders : DataLoader
+        Used in ``trainer.fit()``. A PyTorch DataLoader with training samples.
+        If the ``lightning_module`` has a predefined train_dataloader method this will be skipped.
+    val_dataloaders : DataLoader or List of DataLoader
+        Used in ``trainer.fit()``. Either a single PyTorch Dataloader or a list of them, specifying validation samples.
+        If the ``lightning_module`` has a predefined val_dataloaders method this will be skipped.
+    trainer_kwargs : dict
+        Optional keyword arguments passed to trainer. See
+        `Lightning documentation <https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html>`__ for details.
+    """
+
+    def __init__(self, criterion: Type[nn.Module] = nn.CrossEntropyLoss,
+                 learning_rate: float = 0.001,
+                 weight_decay: float = 0.,
+                 optimizer: optim.Optimizer = optim.Adam,
+                 train_dataloader: Optional[DataLoader] = None,
+                 val_dataloaders: Union[DataLoader, List[DataLoader], None] = None,
+                 **trainer_kwargs):
+        module = _ClassificationModule(criterion=criterion, learning_rate=learning_rate,
+                                       weight_decay=weight_decay, optimizer=optimizer)
+        super().__init__(module, Trainer(use_cgo=True, **trainer_kwargs),
+                         train_dataloader=train_dataloader, val_dataloaders=val_dataloaders)
+
+class _RegressionModule(_MultiModelSupervisedLearningModule):
+    def __init__(self, criterion: Type[nn.Module] = nn.MSELoss,
+                 learning_rate: float = 0.001,
+                 weight_decay: float = 0.,
+                 optimizer: optim.Optimizer = optim.Adam):
+        super().__init__(criterion, {'mse': torchmetrics.MeanSquaredError},
+                         learning_rate=learning_rate, weight_decay=weight_decay, optimizer=optimizer)
+
+
+class Regression(Lightning):
+    """
+    Trainer that is used for regression.
+
+    Parameters
+    ----------
+    criterion : nn.Module
+        Class for criterion module (not an instance). default: ``nn.MSELoss``
+    learning_rate : float
+        Learning rate. default: 0.001
+    weight_decay : float
+        L2 weight decay. default: 0
+    optimizer : Optimizer
+        Class for optimizer (not an instance). default: ``Adam``
+    train_dataloders : DataLoader
+        Used in ``trainer.fit()``. A PyTorch DataLoader with training samples.
+        If the ``lightning_module`` has a predefined train_dataloader method this will be skipped.
+    val_dataloaders : DataLoader or List of DataLoader
+        Used in ``trainer.fit()``. Either a single PyTorch Dataloader or a list of them, specifying validation samples.
+        If the ``lightning_module`` has a predefined val_dataloaders method this will be skipped.
+    trainer_kwargs : dict
+        Optional keyword arguments passed to trainer. See
+        `Lightning documentation <https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html>`__ for details.
+    """
+
+    def __init__(self, criterion: nn.Module = nn.MSELoss,
+                 learning_rate: float = 0.001,
+                 weight_decay: float = 0.,
+                 optimizer: optim.Optimizer = optim.Adam,
+                 train_dataloader: Optional[DataLoader] = None,
+                 val_dataloaders: Union[DataLoader, List[DataLoader], None] = None,
+                 **trainer_kwargs):
+        module = _RegressionModule(criterion=criterion, learning_rate=learning_rate,
+                                   weight_decay=weight_decay, optimizer=optimizer)
+        super().__init__(module, Trainer(use_cgo=True, **trainer_kwargs),
+                         train_dataloader=train_dataloader, val_dataloaders=val_dataloaders)
--- a/nni/nas/evaluator/pytorch/cgo/trainer.py
+++ b/nni/nas/evaluator/pytorch/cgo/trainer.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import pytorch_lightning as pl
+from pytorch_lightning.strategies import SingleDeviceStrategy
+
+class BypassStrategy(SingleDeviceStrategy):
+    strategy_name = "single_device"
+
+    def model_to_device(self) -> None:
+        pass
+
+class Trainer(pl.Trainer):
+    """
+    Trainer for cross-graph optimization.
+
+    Parameters
+    ----------
+    use_cgo : bool
+        Whether cross-graph optimization (CGO) is used.
+        If it is True, CGO will manage device placement.
+        Any device placement from pytorch lightning will be bypassed.
+        default: False
+    trainer_kwargs : dict
+        Optional keyword arguments passed to trainer. See
+        `Lightning documentation <https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html>`__ for details.
+    """
+
+    def __init__(self, use_cgo=False, **trainer_kwargs):
+        if use_cgo:
+            if "accelerator" in trainer_kwargs:
+                raise ValueError("accelerator should not be set when cross-graph optimization is enabled.")
+
+            if 'strategy' in trainer_kwargs:
+                raise ValueError("cgo.trainer does not support specifying strategy")
+            trainer_kwargs['strategy'] = BypassStrategy()
+
+        super().__init__(**trainer_kwargs)
--- a/nni/nas/evaluator/pytorch/lightning.py
+++ b/nni/nas/evaluator/pytorch/lightning.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import os
+import warnings
+from pathlib import Path
+from typing import Any, Dict, Union, Optional, List, Callable, Type
+
+import pytorch_lightning as pl
+import torch.nn as nn
+import torch.nn.functional as nn_functional
+import torch.optim as optim
+import torchmetrics
+import torch.utils.data as torch_data
+
+import nni
+from nni.common.serializer import is_traceable
+try:
+    from .cgo import trainer as cgo_trainer
+    cgo_import_failed = False
+except ImportError:
+    cgo_import_failed = True
+
+from nni.nas.evaluator import Evaluator
+from nni.typehint import Literal
+
+
+__all__ = [
+    'LightningModule', 'Trainer', 'DataLoader', 'Lightning', 'Classification', 'Regression',
+    '_AccuracyWithLogits', '_SupervisedLearningModule', '_ClassificationModule', '_RegressionModule',
+    # FIXME: hack to make it importable for tests
+]
+
+
+class LightningModule(pl.LightningModule):
+    """
+    Basic wrapper of generated model.
+    Lightning modules used in NNI should inherit this class.
+
+    It's a subclass of ``pytorch_lightning.LightningModule``.
+    See https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html
+    """
+
+    running_mode: Literal['multi', 'oneshot'] = 'multi'
+    """An indicator of whether current module is running in a multi-trial experiment or an one-shot.
+    This flag should be automatically set by experiments when they start to run.
+    """
+
+    def set_model(self, model: Union[Callable[[], nn.Module], nn.Module]) -> None:
+        """Set the inner model (architecture) to train / evaluate.
+
+        Parameters
+        ----------
+        model : callable or nn.Module
+            Can be a callable returning nn.Module or nn.Module.
+        """
+        if isinstance(model, nn.Module):
+            self.model = model
+        else:
+            self.model = model()
+
+
+Trainer = nni.trace(pl.Trainer)
+Trainer.__doc__ = """
+Traced version of ``pytorch_lightning.Trainer``. See https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html
+"""
+DataLoader = nni.trace(torch_data.DataLoader)
+DataLoader.__doc__ = """
+Traced version of ``torch.utils.data.DataLoader``. See https://pytorch.org/docs/stable/data.html
+"""
+
+
+@nni.trace
+class Lightning(Evaluator):
+    """
+    Delegate the whole training to PyTorch Lightning.
+
+    Since the arguments passed to the initialization needs to be serialized, ``LightningModule``, ``Trainer`` or
+    ``DataLoader`` in this file should be used. Another option is to hide dataloader in the Lightning module, in
+    which case, dataloaders are not required for this class to work.
+
+    Following the programming style of Lightning, metrics sent to NNI should be obtained from ``callback_metrics``
+    in trainer. Two hooks are added at the end of validation epoch and the end of ``fit``, respectively. The metric name
+    and type depend on the specific task.
+
+    .. warning::
+
+       The Lightning evaluator are stateful. If you try to use a previous Lightning evaluator,
+       please note that the inner ``lightning_module`` and ``trainer`` will be reused.
+
+    Parameters
+    ----------
+    lightning_module
+        Lightning module that defines the training logic.
+    trainer
+        Lightning trainer that handles the training.
+    train_dataloders
+        Used in ``trainer.fit()``. A PyTorch DataLoader with training samples.
+        If the ``lightning_module`` has a predefined train_dataloader method this will be skipped.
+        It can be `any types of dataloader supported by Lightning <https://pytorch-lightning.readthedocs.io/en/stable/guides/data.html>`__.
+    val_dataloaders
+        Used in ``trainer.fit()``. Either a single PyTorch Dataloader or a list of them, specifying validation samples.
+        If the ``lightning_module`` has a predefined val_dataloaders method this will be skipped.
+        It can be `any types of dataloader supported by Lightning <https://pytorch-lightning.readthedocs.io/en/stable/guides/data.html>`__.
+    """
+
+    def __init__(self, lightning_module: LightningModule, trainer: Trainer,
+                 train_dataloaders: Optional[Any] = None,
+                 val_dataloaders: Optional[Any] = None,
+                 train_dataloader: Optional[Any] = None):
+        assert isinstance(lightning_module, LightningModule), f'Lightning module must be an instance of {__name__}.LightningModule.'
+        if train_dataloader is not None:
+            warnings.warn('`train_dataloader` is deprecated and replaced with `train_dataloaders`.', DeprecationWarning)
+            train_dataloaders = train_dataloader
+        if cgo_import_failed:
+            assert isinstance(trainer, pl.Trainer) and is_traceable(trainer), f'Trainer must be imported from {__name__}'
+        else:
+            # this is not isinstance(trainer, Trainer) because with a different trace call, it can be different
+            assert (isinstance(trainer, pl.Trainer) and is_traceable(trainer)) or isinstance(trainer, cgo_trainer.Trainer), \
+                f'Trainer must be imported from {__name__} or nni.nas.evaluator.pytorch.cgo.trainer'
+        if not _check_dataloader(train_dataloaders):
+            warnings.warn(f'Please try to wrap PyTorch DataLoader with nni.trace or '
+                          f'import DataLoader from {__name__}: {train_dataloaders}',
+                          RuntimeWarning)
+        if not _check_dataloader(val_dataloaders):
+            warnings.warn(f'Please try to wrap PyTorch DataLoader with nni.trace or '
+                          f'import DataLoader from {__name__}: {val_dataloaders}',
+                          RuntimeWarning)
+        self.module = lightning_module
+        self.trainer = trainer
+        self.train_dataloaders = train_dataloaders
+        self.val_dataloaders = val_dataloaders
+
+    @staticmethod
+    def _load(ir):
+        return Lightning(ir['module'], ir['trainer'], ir['train_dataloaders'], ir['val_dataloaders'])
+
+    def _dump(self):
+        return {
+            'type': self.__class__,
+            'module': self.module,
+            'trainer': self.trainer,
+            'train_dataloaders': self.train_dataloaders,
+            'val_dataloaders': self.val_dataloaders
+        }
+
+    def _execute(self, model_cls):
+        return self.fit(model_cls)
+
+    @property
+    def train_dataloader(self):
+        warnings.warn('train_dataloader is deprecated, please use `train_dataloaders`.', DeprecationWarning)
+
+    def __eq__(self, other):
+        eq_func = False
+        eq_args = False
+        if other is None:
+            return False
+        if hasattr(self, "function") and hasattr(other, "function"):
+            eq_func = getattr(self, "function") == getattr(other, "function")
+        elif not (hasattr(self, "function") or hasattr(other, "function")):
+            eq_func = True
+
+        if hasattr(self, "arguments") and hasattr(other, "arguments"):
+            eq_args = getattr(self, "arguments") == getattr(other, "arguments")
+        elif not (hasattr(self, "arguments") or hasattr(other, "arguments")):
+            eq_args = True
+
+        return eq_func and eq_args
+
+    def fit(self, model):
+        """
+        Fit the model with provided dataloader, with Lightning trainer.
+
+        Parameters
+        ----------
+        model : nn.Module
+            The model to fit.
+        """
+        self.module.set_model(model)
+        return self.trainer.fit(self.module, self.train_dataloaders, self.val_dataloaders)
+
+
+def _check_dataloader(dataloader):
+    # Check the type of dataloader recursively.
+    if isinstance(dataloader, list):
+        return all([_check_dataloader(d) for d in dataloader])
+    if isinstance(dataloader, dict):
+        return all([_check_dataloader(v) for v in dataloader.values()])
+    if isinstance(dataloader, torch_data.DataLoader):
+        return is_traceable(dataloader)
+    return True
+
+
+### The following are some commonly used Lightning modules ###
+
+class _SupervisedLearningModule(LightningModule):
+
+    trainer: pl.Trainer
+
+    def __init__(self, criterion: Type[nn.Module], metrics: Dict[str, Type[torchmetrics.Metric]],
+                 learning_rate: float = 0.001,
+                 weight_decay: float = 0.,
+                 optimizer: Type[optim.Optimizer] = optim.Adam,
+                 export_onnx: Union[Path, str, bool, None] = None):
+        super().__init__()
+        self.save_hyperparameters('criterion', 'optimizer', 'learning_rate', 'weight_decay')
+        self.criterion = criterion()
+        self.optimizer = optimizer
+        self.metrics = nn.ModuleDict({name: cls() for name, cls in metrics.items()})
+
+        if export_onnx is None or export_onnx is True:
+            self.export_onnx = Path(os.environ.get('NNI_OUTPUT_DIR', '.')) / 'model.onnx'
+        elif export_onnx:
+            self.export_onnx = Path(export_onnx)
+        else:
+            self.export_onnx = None
+
+    def forward(self, x):
+        y_hat = self.model(x)
+        return y_hat
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self(x)
+        loss = self.criterion(y_hat, y)
+        self.log('train_loss', loss, prog_bar=True)
+        for name, metric in self.metrics.items():
+            self.log('train_' + name, metric(y_hat, y), prog_bar=True)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self(x)
+
+        if self.running_mode == 'multi' and self.export_onnx is not None:
+            self.export_onnx.parent.mkdir(exist_ok=True)
+            try:
+                self.to_onnx(self.export_onnx, x, export_params=True)
+            except RuntimeError as e:
+                warnings.warn(f'ONNX conversion failed. As a result, you might not be able to use visualization. Error message: {e}')
+            self.export_onnx = None
+
+        self.log('val_loss', self.criterion(y_hat, y), prog_bar=True)
+        for name, metric in self.metrics.items():
+            self.log('val_' + name, metric(y_hat, y), prog_bar=True)
+
+    def test_step(self, batch, batch_idx):
+        x, y = batch
+        y_hat = self(x)
+        self.log('test_loss', self.criterion(y_hat, y), prog_bar=True)
+        for name, metric in self.metrics.items():
+            self.log('test_' + name, metric(y_hat, y), prog_bar=True)
+
+    def configure_optimizers(self):
+        return self.optimizer(self.parameters(), lr=self.hparams.learning_rate, weight_decay=self.hparams.weight_decay)  # type: ignore
+
+    def on_validation_epoch_end(self):
+        if not self.trainer.sanity_checking and self.running_mode == 'multi':
+            # Don't report metric when sanity checking
+            nni.report_intermediate_result(self._get_validation_metrics())
+
+    def on_fit_end(self):
+        if self.running_mode == 'multi':
+            nni.report_final_result(self._get_validation_metrics())
+
+    def _get_validation_metrics(self):
+        if len(self.metrics) == 1:
+            metric_name = next(iter(self.metrics))
+            return self.trainer.callback_metrics['val_' + metric_name].item()
+        else:
+            warnings.warn('Multiple metrics without "default" is not supported by current framework.')
+            return {name: self.trainer.callback_metrics['val_' + name].item() for name in self.metrics}
+
+
+class _AccuracyWithLogits(torchmetrics.Accuracy):
+    def update(self, pred, target):
+        return super().update(nn_functional.softmax(pred, dim=-1), target)
+
+
+@nni.trace
+class _ClassificationModule(_SupervisedLearningModule):
+    def __init__(self, criterion: Type[nn.Module] = nn.CrossEntropyLoss,
+                 learning_rate: float = 0.001,
+                 weight_decay: float = 0.,
+                 optimizer: Type[optim.Optimizer] = optim.Adam,
+                 export_onnx: bool = True):
+        super().__init__(criterion, {'acc': _AccuracyWithLogits},
+                         learning_rate=learning_rate, weight_decay=weight_decay, optimizer=optimizer,
+                         export_onnx=export_onnx)
+
+
+class Classification(Lightning):
+    """
+    Evaluator that is used for classification.
+
+    Parameters
+    ----------
+    criterion : nn.Module
+        Class for criterion module (not an instance). default: ``nn.CrossEntropyLoss``
+    learning_rate : float
+        Learning rate. default: 0.001
+    weight_decay : float
+        L2 weight decay. default: 0
+    optimizer : Optimizer
+        Class for optimizer (not an instance). default: ``Adam``
+    train_dataloaders : DataLoader
+        Used in ``trainer.fit()``. A PyTorch DataLoader with training samples.
+        If the ``lightning_module`` has a predefined train_dataloader method this will be skipped.
+    val_dataloaders : DataLoader or List of DataLoader
+        Used in ``trainer.fit()``. Either a single PyTorch Dataloader or a list of them, specifying validation samples.
+        If the ``lightning_module`` has a predefined val_dataloaders method this will be skipped.
+    export_onnx : bool
+        If true, model will be exported to ``model.onnx`` before training starts. default true
+    trainer_kwargs : dict
+        Optional keyword arguments passed to trainer. See
+        `Lightning documentation <https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html>`__ for details.
+
+    Examples
+    --------
+    >>> evaluator = Classification()
+
+    To use customized criterion and optimizer:
+
+    >>> evaluator = Classification(nn.LabelSmoothingCrossEntropy, optimizer=torch.optim.SGD)
+
+    Extra keyword arguments will be passed to trainer, some of which might be necessary to enable GPU acceleration:
+
+    >>> evaluator = Classification(accelerator='gpu', devices=2, strategy='ddp')
+    """
+
+    def __init__(self, criterion: Type[nn.Module] = nn.CrossEntropyLoss,
+                 learning_rate: float = 0.001,
+                 weight_decay: float = 0.,
+                 optimizer: Type[optim.Optimizer] = optim.Adam,
+                 train_dataloaders: Optional[DataLoader] = None,
+                 val_dataloaders: Union[DataLoader, List[DataLoader], None] = None,
+                 export_onnx: bool = True,
+                 train_dataloader: Optional[DataLoader] = None,
+                 **trainer_kwargs):
+        if train_dataloader is not None:
+            warnings.warn('`train_dataloader` is deprecated and replaced with `train_dataloaders`.', DeprecationWarning)
+            train_dataloaders = train_dataloader
+        module = _ClassificationModule(criterion=criterion, learning_rate=learning_rate,
+                                       weight_decay=weight_decay, optimizer=optimizer, export_onnx=export_onnx)
+        super().__init__(module, Trainer(**trainer_kwargs),
+                         train_dataloaders=train_dataloaders, val_dataloaders=val_dataloaders)
+
+
+@nni.trace
+class _RegressionModule(_SupervisedLearningModule):
+    def __init__(self, criterion: Type[nn.Module] = nn.MSELoss,
+                 learning_rate: float = 0.001,
+                 weight_decay: float = 0.,
+                 optimizer: Type[optim.Optimizer] = optim.Adam,
+                 export_onnx: bool = True):
+        super().__init__(criterion, {'mse': torchmetrics.MeanSquaredError},
+                         learning_rate=learning_rate, weight_decay=weight_decay, optimizer=optimizer,
+                         export_onnx=export_onnx)
+
+
+class Regression(Lightning):
+    """
+    Evaluator that is used for regression.
+
+    Parameters
+    ----------
+    criterion : nn.Module
+        Class for criterion module (not an instance). default: ``nn.MSELoss``
+    learning_rate : float
+        Learning rate. default: 0.001
+    weight_decay : float
+        L2 weight decay. default: 0
+    optimizer : Optimizer
+        Class for optimizer (not an instance). default: ``Adam``
+    train_dataloaders : DataLoader
+        Used in ``trainer.fit()``. A PyTorch DataLoader with training samples.
+        If the ``lightning_module`` has a predefined train_dataloader method this will be skipped.
+    val_dataloaders : DataLoader or List of DataLoader
+        Used in ``trainer.fit()``. Either a single PyTorch Dataloader or a list of them, specifying validation samples.
+        If the ``lightning_module`` has a predefined val_dataloaders method this will be skipped.
+    export_onnx : bool
+        If true, model will be exported to ``model.onnx`` before training starts. default: true
+    trainer_kwargs : dict
+        Optional keyword arguments passed to trainer. See
+        `Lightning documentation <https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html>`__ for details.
+
+    Examples
+    --------
+    >>> evaluator = Regression()
+
+    Extra keyword arguments will be passed to trainer, some of which might be necessary to enable GPU acceleration:
+
+    >>> evaluator = Regression(gpus=1)
+    """
+
+    def __init__(self, criterion: Type[nn.Module] = nn.MSELoss,
+                 learning_rate: float = 0.001,
+                 weight_decay: float = 0.,
+                 optimizer: Type[optim.Optimizer] = optim.Adam,
+                 train_dataloaders: Optional[DataLoader] = None,
+                 val_dataloaders: Union[DataLoader, List[DataLoader], None] = None,
+                 export_onnx: bool = True,
+                 train_dataloader: Optional[DataLoader] = None,
+                 **trainer_kwargs):
+        if train_dataloader is not None:
+            warnings.warn('`train_dataloader` is deprecated and replaced with `train_dataloaders`.', DeprecationWarning)
+            train_dataloaders = train_dataloader
+        module = _RegressionModule(criterion=criterion, learning_rate=learning_rate,
+                                   weight_decay=weight_decay, optimizer=optimizer, export_onnx=export_onnx)
+        super().__init__(module, Trainer(**trainer_kwargs),
+                         train_dataloaders=train_dataloaders, val_dataloaders=val_dataloaders)
--- a/nni/algorithms/nas/pytorch/__init__.py
+++ b/nni/algorithms/nas/pytorch/__init__.py
--- a/nni/nas/pytorch/nasbench201/__init__.py
+++ b/nni/nas/pytorch/nasbench201/__init__.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-from .nasbench201 import NASBench201Cell
+from .api import *
+from .common import *