Unverified commit a911b856, authored by Yuge Zhang, committed by GitHub

Resolve conflicts for #4760 (#4762)

parent 14d2966b
@@ -17,6 +17,8 @@ _logger = logging.getLogger(__name__)
 class BaseGraphData:
     """
+    Data sent between strategy and trial, in graph-based execution engine.
+
     Attributes
     ----------
     model_script
...
@@ -200,7 +200,7 @@ class CGOExecutionEngine(AbstractExecutionEngine):
         # replace the module with a new instance whose n_models is set
         # n_models must be set in __init__, otherwise it cannot be captured by serialize_cls
-        new_module_init_params = model.evaluator.module.trace_kwargs.copy()
+        new_module_init_params = model.evaluator.module.dump_kwargs().copy()
         # MultiModelSupervisedLearningModule hides n_models of _MultiModelSupervisedLearningModule from users
         new_module_init_params['n_models'] = len(multi_model)
...
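For context on the `trace_kwargs` → `dump_kwargs()` change above, here is a minimal sketch of the tracing pattern it relies on; the class and parameter names are illustrative, not from the NNI codebase:

```python
import nni

@nni.trace
class MyModule:
    """A traced class records its __init__ arguments for later re-instantiation."""
    def __init__(self, lr: float = 0.1, n_models: int = 1):
        self.lr = lr
        self.n_models = n_models

m = MyModule(lr=0.01)
# Recover the recorded kwargs, patch them, and rebuild an equivalent instance,
# mirroring how the engine patches n_models before re-creating the module.
# (Attribute form shown; this commit switches to the dump_kwargs() accessor.)
params = dict(m.trace_kwargs)
params['n_models'] = 4
m4 = MyModule(**params)
```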
@@ -45,6 +45,9 @@ from ..strategy.utils import dry_run_for_formatted_search_space
 _logger = logging.getLogger(__name__)
 
+__all__ = ['RetiariiExeConfig', 'RetiariiExperiment']
+
+
 @dataclass(init=False)
 class RetiariiExeConfig(ConfigBase):
     experiment_name: Optional[str] = None
...@@ -145,6 +148,10 @@ def preprocess_model(base_model, evaluator, applied_mutators, full_ir=True, dumm ...@@ -145,6 +148,10 @@ def preprocess_model(base_model, evaluator, applied_mutators, full_ir=True, dumm
'do not use mutators when you use LayerChoice/InputChoice') 'do not use mutators when you use LayerChoice/InputChoice')
if mutators is not None: if mutators is not None:
applied_mutators = mutators applied_mutators = mutators
# Add mutations on evaluators
applied_mutators += process_evaluator_mutations(evaluator, applied_mutators)
return base_model_ir, applied_mutators return base_model_ir, applied_mutators
@@ -284,7 +291,6 @@ class RetiariiExperiment(Experiment):
             full_ir=self.config.execution_engine not in ['py', 'benchmark'],
             dummy_input=self.config.dummy_input
         )
-        self.applied_mutators += process_evaluator_mutations(self.evaluator, self.applied_mutators)
         _logger.info('Start strategy...')
         search_space = dry_run_for_formatted_search_space(base_model_ir, self.applied_mutators)
@@ -475,6 +481,8 @@ class RetiariiExperiment(Experiment):
         For one-shot algorithms, only top-1 is supported. For others, ``optimize_mode`` and ``formatter`` are
         available for customization.
+
+        Parameters
+        ----------
         top_k : int
             How many models are intended to be exported.
         optimize_mode : str
...
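A hedged usage sketch for the parameters documented above; `exp` stands for a `RetiariiExperiment` whose search has finished:

```python
# Export the best three models found by the strategy (sketch, not from the diff).
top_models = exp.export_top_models(top_k=3, optimize_mode='maximize', formatter='dict')
for arch in top_models:
    print(arch)  # one dict of architecture choices per exported model
```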
@@ -75,12 +75,12 @@ class Model:
     """
     Represents a neural network model.
 
-    During mutation, one `Model` object is created for each trainable snapshot.
+    During mutation, one :class:`Model` object is created for each trainable snapshot.
     For example, consider a mutator that inserts a node at an edge in each iteration.
     In one iteration, the mutator invokes 4 primitives: add node, remove edge, add edge to head, add edge to tail.
-    These 4 primitives operates in one `Model` object.
+    These 4 primitives operate in one :class:`Model` object.
     When they are all done, the model will be set to "frozen" (trainable) status and submitted to the execution engine.
-    And then a new iteration starts, and a new `Model` object is created by forking last model.
+    Then a new iteration starts, and a new :class:`Model` object is created by forking the last model.
 
     Attributes
     ----------
@@ -91,7 +91,7 @@ class Model:
     python_init_params
         Initialization parameters of python class.
     status
-        See `ModelStatus`.
+        See :class:`ModelStatus`.
     root_graph
         The outermost graph, which usually takes the dataset as input and feeds output to the loss function.
     graphs
@@ -100,11 +100,11 @@ class Model:
         Model evaluator
     history
         Mutation history.
-        `self` is directly mutated from `self.history[-1]`;
-        `self.history[-1]` is mutated from `self.history[-2]`, and so on.
-        `self.history[0]` is the base graph.
+        ``self`` is directly mutated from ``self.history[-1]``;
+        ``self.history[-1]`` is mutated from ``self.history[-2]``, and so on.
+        ``self.history[0]`` is the base graph.
     metric
-        Training result of the model, or `None` if it's not yet trained or has failed to train.
+        Training result of the model, or ``None`` if it's not yet trained or has failed to train.
     intermediate_metrics
         Intermediate training metrics. If the model is not trained, it's an empty list.
     """
@@ -265,9 +265,9 @@ class Graph:
     Graph topology.
 
     This class simply represents the topology, with no semantic meaning.
-    All other information like metric, non-graph functions, mutation history, etc should go to `Model`.
+    All other information like metric, non-graph functions, mutation history, etc. should go to :class:`Model`.
 
-    Each graph belongs to and only belongs to one `Model`.
+    Each graph belongs to and only belongs to one :class:`Model`.
 
     Attributes
     ----------
@@ -284,15 +284,15 @@ class Graph:
     output_names
         Optional mnemonic names of output values.
     input_node
-        ...
+        Incoming node.
     output_node
-        ...
+        Output node.
     hidden_nodes
-        ...
+        Hidden nodes.
     nodes
         All input/output/hidden nodes.
     edges
-        ...
+        Edges.
     python_name
         The name of torch.nn.Module; should have one-to-one mapping with items in the python model.
     """
@@ -532,16 +532,16 @@ class Node:
     """
     An operation or an opaque subgraph inside a graph.
 
-    Each node belongs to and only belongs to one `Graph`.
-    Nodes should never be created with constructor. Use `Graph.add_node()` instead.
+    Each node belongs to and only belongs to one :class:`Graph`.
+    Nodes should never be created with the constructor. Use :meth:`Graph.add_node` instead.
 
     The node itself is for topology only.
-    Information of tensor calculation should all go inside `operation` attribute.
+    Information of tensor calculation should all go inside the ``operation`` attribute.
 
     TODO: parameter of subgraph (cell)
     It's easy to assign parameters on a cell node, but it's hard to "use" them.
     We need to design a way to reference stored cell parameters in inner node operations.
-    e.g. `self.fc = Linear(self.units)` <- how to express `self.units` in IR?
+    e.g. ``self.fc = Linear(self.units)`` <- how to express ``self.units`` in IR?
 
     Attributes
     ----------
@@ -557,10 +557,10 @@ class Node:
     label
         Optional. If two nodes have the same label, they are considered the same by the mutator.
     operation
-        ...
+        Operation.
     cell
         Read-only shortcut to get the referenced subgraph.
-        If this node is not a subgraph (is a primitive operation), accessing `cell` will raise an error.
+        If this node is not a subgraph (is a primitive operation), accessing ``cell`` will raise an error.
     predecessors
         Predecessor nodes of this node in the graph. This is an optional mutation helper.
     successors
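A hedged sketch of the creation rule stated above; the node name, operation type, and parameter names are illustrative:

```python
# Nodes are always created through the owning graph, never via Node(...) directly.
fc = graph.add_node('fc1', 'Linear', {'in_features': 64, 'out_features': 10})
# Topology lives on the Node itself; the computation lives in fc.operation.
```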
@@ -677,36 +677,36 @@ class Edge:
     """
     A tensor, or "data flow", between two nodes.
 
-    Example forward code snippet:
-    ```
+    Example forward code snippet: ::
+
         a, b, c = split(x)
         p = concat(a, c)
         q = sum(b, p)
         z = relu(q)
-    ```
 
-    Edges in above snippet:
+    Edges in the above snippet: ::
+
         + head: (split, 0), tail: (concat, 0)         # a in concat
         + head: (split, 2), tail: (concat, 1)         # c in concat
         + head: (split, 1), tail: (sum, -1 or 0)      # b in sum
         + head: (concat, null), tail: (sum, -1 or 1)  # p in sum
         + head: (sum, null), tail: (relu, null)       # q in relu
 
     Attributes
     ----------
     graph
-        ...
+        Graph.
     head
         Head node.
     tail
         Tail node.
     head_slot
         Index of outputs in the head node.
-        If the node has only one output, this should be `null`.
+        If the node has only one output, this should be ``null``.
     tail_slot
         Index of inputs in the tail node.
-        If the node has only one input, this should be `null`.
-        If the node does not care about order, this can be `-1`.
+        If the node has only one input, this should be ``null``.
+        If the node does not care about order, this can be ``-1``.
     """
 
     def __init__(self, head: EdgeEndpoint, tail: EdgeEndpoint, _internal: bool = False):
...
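Connecting the slot semantics above to the API, a hedged sketch (`graph` and the `*_node` variables are illustrative; an `EdgeEndpoint` is a `(node, slot)` pair):

```python
# "a in concat": output slot 0 of split feeds input slot 0 of concat.
graph.add_edge(head=(split_node, 0), tail=(concat_node, 0))
# "p in sum": concat has a single output, so its slot is None (rendered as
# `null` in the docstring); sum does not care about order, so -1 is accepted.
graph.add_edge(head=(concat_node, None), tail=(sum_node, -1))
```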
This README will be deleted once this hub is stabilized, after which we will promote it in the documentation.

## Why

We hereby provide a series of state-of-the-art search spaces, each of which is a PyTorch model + mutations + training recipe.

For further motivations and plans, please see https://github.com/microsoft/nni/issues/4249.
## Reproduction Roadmap
1. Runnable
2. Load checkpoint of searched architecture and evaluate
3. Reproduce searched architecture
4. Runnable with built-in algos
5. Reproduce result with at least one algo
| Search space           | 1      | 2      | 3      | 4      | 5      |
|------------------------|--------|--------|--------|--------|--------|
| NasBench101 | Y | | | | |
| NasBench201 | Y | | | | |
| NASNet | Y | | | | |
| ENAS | Y | | | | |
| AmoebaNet | Y | | | | |
| PNAS | Y | | | | |
| DARTS | Y | | | | |
| ProxylessNAS | Y | | | | |
| MobileNetV3Space | Y | | | | |
| ShuffleNetSpace | Y | | | | |
| ShuffleNetSpace (ch) | Y | | | | |
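As a quick taste, a hedged sketch of instantiating a space from this hub (the import path is assumed and may change while the hub is unstable):

```python
from nni.retiarii.hub.pytorch import MobileNetV3Space  # path assumed, see note above

model_space = MobileNetV3Space()
```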
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from .mobilenetv3 import MobileNetV3Space
from .nasbench101 import NasBench101
from .nasbench201 import NasBench201
from .nasnet import NDS, NASNet, ENAS, AmoebaNet, PNAS, DARTS
from .proxylessnas import ProxylessNAS
from .shufflenet import ShuffleNetSpace
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from typing import Tuple, Optional, Callable
import nni.retiarii.nn.pytorch as nn
from nni.retiarii import model_wrapper
from .proxylessnas import ConvBNReLU, InvertedResidual, SeparableConv, make_divisible, reset_parameters
class h_sigmoid(nn.Module):
    def __init__(self, inplace=True):
        super(h_sigmoid, self).__init__()
        self.relu = nn.ReLU6(inplace=inplace)

    def forward(self, x):
        return self.relu(x + 3) / 6


class h_swish(nn.Module):
    def __init__(self, inplace=True):
        super(h_swish, self).__init__()
        self.sigmoid = h_sigmoid(inplace=inplace)

    def forward(self, x):
        return x * self.sigmoid(x)
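# Note: h_sigmoid and h_swish above are numerically identical to torch's
# built-in nn.Hardsigmoid and nn.Hardswish (available since PyTorch 1.6);
# they are presumably redefined here to keep explicit control over `inplace`.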
class SELayer(nn.Module):
    """Squeeze-and-excite layer."""

    def __init__(self,
                 channels: int,
                 reduction: int = 4,
                 activation_layer: Optional[Callable[..., nn.Module]] = None):
        super().__init__()
        if activation_layer is None:
            activation_layer = nn.Sigmoid
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channels, make_divisible(channels // reduction, 8)),
            nn.ReLU(inplace=True),
            nn.Linear(make_divisible(channels // reduction, 8), channels),
            activation_layer()
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y
@model_wrapper
class MobileNetV3Space(nn.Module):
    """
    MobileNetV3Space implements the largest search space in `TuNAS <https://arxiv.org/abs/2008.06120>`__.

    The search dimensions include widths, expand ratios, kernel sizes, and SE ratio.
    Some of them can be turned off via arguments to narrow down the search space.

    Different from the ProxylessNAS search space, this space is implemented with :class:`nn.ValueChoice`.
    We use the following snippet as reference:
    https://github.com/google-research/google-research/blob/20736344591f774f4b1570af64624ed1e18d2867/tunas/mobile_search_space_v3.py#L728
    """
    def __init__(self, num_labels: int = 1000,
                 base_widths: Tuple[int, ...] = (16, 16, 32, 64, 128, 256, 512, 1024),
                 width_multipliers: Tuple[float, ...] = (0.5, 0.625, 0.75, 1.0, 1.25, 1.5, 2.0),
                 expand_ratios: Tuple[int, ...] = (1, 2, 3, 4, 5, 6),
                 dropout_rate: float = 0.2,
                 bn_eps: float = 1e-3,
                 bn_momentum: float = 0.1):
        super().__init__()
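        # One independent width choice per stage: multiplier-scaled base widths,
        # rounded to multiples of 8 via make_divisible.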
        self.widths = [
            nn.ValueChoice([make_divisible(base_width * mult, 8) for mult in width_multipliers], label=f'width_{i}')
            for i, base_width in enumerate(base_widths)
        ]
        self.expand_ratios = expand_ratios

        blocks = [
            # Stem
            ConvBNReLU(
                3, self.widths[0],
                nn.ValueChoice([3, 5], label='ks_0'),
                stride=2, activation_layer=h_swish
            ),
            SeparableConv(self.widths[0], self.widths[0], activation_layer=nn.ReLU),
        ]
        # counting for kernel sizes and expand ratios
        self.layer_count = 2

        blocks += [
            # Body: stages are numbered 1-5 so that each one gets its own
            # independent depth choice label (`depth_{stage_idx}`).
            self._make_stage(1, self.widths[0], self.widths[1], False, 2, nn.ReLU),
            self._make_stage(2, self.widths[1], self.widths[2], True, 2, nn.ReLU),
            self._make_stage(3, self.widths[2], self.widths[3], False, 2, h_swish),
            self._make_stage(4, self.widths[3], self.widths[4], True, 1, h_swish),
            self._make_stage(5, self.widths[4], self.widths[5], True, 2, h_swish),
        ]

        # Head
        blocks += [
            ConvBNReLU(self.widths[5], self.widths[6], 1, 1, activation_layer=h_swish),
            nn.AdaptiveAvgPool2d(1),
            ConvBNReLU(self.widths[6], self.widths[7], 1, 1, norm_layer=nn.Identity, activation_layer=h_swish),
        ]

        self.blocks = nn.Sequential(*blocks)

        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(self.widths[7], num_labels),
        )

        reset_parameters(self, bn_momentum=bn_momentum, bn_eps=bn_eps)

    def forward(self, x):
        x = self.blocks(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x
    def _make_stage(self, stage_idx, inp, oup, se, stride, act):
        # initialize them first because they are related to layer_count.
        exp, ks, se_blocks = [], [], []
        for _ in range(4):
            exp.append(nn.ValueChoice(list(self.expand_ratios), label=f'exp_{self.layer_count}'))
            ks.append(nn.ValueChoice([3, 5, 7], label=f'ks_{self.layer_count}'))
            if se:
                # if SE is true, assign a layer choice to SE;
                # bind layer_count as a default argument so each lambda keeps the
                # value at definition time instead of the final loop value
                se_blocks.append(
                    lambda hidden_ch, layer_count=self.layer_count: nn.LayerChoice(
                        [nn.Identity(), SELayer(hidden_ch)], label=f'se_{layer_count}')
                )
            else:
                se_blocks.append(None)
            self.layer_count += 1

        blocks = [
            # stride = 2
            InvertedResidual(inp, oup, exp[0], ks[0],
                             stride, squeeze_and_excite=se_blocks[0], activation_layer=act),
            # stride = 1, residual connection should be automatically enabled
            InvertedResidual(oup, oup, exp[1], ks[1], squeeze_and_excite=se_blocks[1], activation_layer=act),
            InvertedResidual(oup, oup, exp[2], ks[2], squeeze_and_excite=se_blocks[2], activation_layer=act),
            InvertedResidual(oup, oup, exp[3], ks[3], squeeze_and_excite=se_blocks[3], activation_layer=act)
        ]

        # mutable depth
        return nn.Repeat(blocks, depth=(1, 4), label=f'depth_{stage_idx}')
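A hedged end-to-end sketch of how this space is meant to be consumed; the strategy/evaluator wiring follows the public Retiarii API, while the evaluation body is a placeholder:

```python
import nni
import nni.retiarii.strategy as strategy
from nni.retiarii.evaluator import FunctionalEvaluator
from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig

def evaluate_model(model_cls):
    # Placeholder evaluation: instantiate the candidate and report a dummy metric.
    model = model_cls()
    nni.report_final_result(0.0)

model_space = MobileNetV3Space(num_labels=10)
evaluator = FunctionalEvaluator(evaluate_model)
exp = RetiariiExperiment(model_space, evaluator, [], strategy.Random())
exp_config = RetiariiExeConfig('local')
exp_config.trial_concurrency = 1
exp_config.max_trial_number = 3
exp.run(exp_config, 8080)
```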