Unverified Commit f9bbd8d0 authored by Yuge Zhang's avatar Yuge Zhang Committed by GitHub
Browse files

[Retiarii] Search space hub (#4524)

parent 9fde0e8e
......@@ -8,3 +8,4 @@ _generated_model_*.py
_generated_model
generated
lightning_logs
model.onnx
......@@ -177,7 +177,6 @@ class _SupervisedLearningModule(LightningModule):
self.export_onnx = Path(export_onnx)
else:
self.export_onnx = None
self._already_exported = False
def forward(self, x):
y_hat = self.model(x)
......@@ -196,12 +195,12 @@ class _SupervisedLearningModule(LightningModule):
x, y = batch
y_hat = self(x)
if not self._already_exported:
if self.export_onnx is not None:
try:
self.to_onnx(self.export_onnx, x, export_params=True)
except RuntimeError as e:
warnings.warn(f'ONNX conversion failed. As a result, you might not be able to use visualization. Error message: {e}')
self._already_exported = True
self.export_onnx = None
self.log('val_loss', self.criterion(y_hat, y), prog_bar=True)
for name, metric in self.metrics.items():
......
This README will be deleted once this hub is stabilized, after which we will promote it in the documentation.
## Why
We hereby provide a series of state-of-the-art search spaces, each of which consists of a PyTorch model, mutations, and a training recipe.
For further motivations and plans, please see https://github.com/microsoft/nni/issues/4249.
## Reproduction Roadmap
1. Runnable
2. Load checkpoint of searched architecture and evaluate
3. Reproduce searched architecture
4. Runnable with built-in algos
5. Reproduce result with at least one algo
| | 1 | 2 | 3 | 4 | 5 |
|------------------------|--------|--------|--------|--------|--------|
| NasBench101 | Y | | | | |
| NasBench201 | Y | | | | |
| NASNet | Y | | | | |
| ENAS | Y | | | | |
| AmoebaNet | Y | | | | |
| PNAS | Y | | | | |
| DARTS | Y | | | | |
| ProxylessNAS | Y | | | | |
| MobileNetV3Space | Y | | | | |
| ShuffleNetSpace | Y | | | | |
| ShuffleNetSpace (ch) | Y | | | | |
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from .mobilenetv3 import MobileNetV3Space
from .nasbench101 import NasBench101
from .nasbench201 import NasBench201
from .nasnet import NDS, NASNet, ENAS, AmoebaNet, PNAS, DARTS
from .proxylessnas import ProxylessNAS
from .shufflenet import ShuffleNetSpace
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from typing import Tuple, Optional, Callable
import nni.retiarii.nn.pytorch as nn
from nni.retiarii import model_wrapper
from .proxylessnas import ConvBNReLU, InvertedResidual, SeparableConv, make_divisible, reset_parameters
class h_sigmoid(nn.Module):
    """Hard sigmoid activation, computed as ``ReLU6(x + 3) / 6``."""

    def __init__(self, inplace=True):
        super().__init__()
        self.relu = nn.ReLU6(inplace=inplace)

    def forward(self, x):
        shifted = x + 3
        return self.relu(shifted) / 6
class h_swish(nn.Module):
    """Hard swish activation: the input multiplied by its hard sigmoid."""

    def __init__(self, inplace=True):
        super().__init__()
        self.sigmoid = h_sigmoid(inplace=inplace)

    def forward(self, x):
        gate = self.sigmoid(x)
        return x * gate
class SELayer(nn.Module):
    """Squeeze-and-excite layer.

    The input is globally average-pooled to one value per channel, passed
    through a two-layer bottleneck MLP and the resulting per-channel gate is
    multiplied back onto the input.

    Parameters
    ----------
    channels : int
        Number of input (and output) channels.
    reduction : int
        Reduction factor of the bottleneck; the hidden width is
        ``make_divisible(channels // reduction, 8)``.
    activation_layer : callable, optional
        Factory of the gating activation. Defaults to ``nn.Sigmoid``.
    """

    def __init__(self,
                 channels: int,
                 reduction: int = 4,
                 activation_layer: Optional[Callable[..., nn.Module]] = None):
        super().__init__()
        gate_factory = nn.Sigmoid if activation_layer is None else activation_layer
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        squeeze = nn.Linear(channels, make_divisible(channels // reduction, 8))
        relu = nn.ReLU(inplace=True)
        excite = nn.Linear(make_divisible(channels // reduction, 8), channels)
        self.fc = nn.Sequential(squeeze, relu, excite, gate_factory())

    def forward(self, x):
        batch, channels, _, _ = x.size()
        pooled = self.avg_pool(x).view(batch, channels)
        scale = self.fc(pooled).view(batch, channels, 1, 1)
        return x * scale
@model_wrapper
class MobileNetV3Space(nn.Module):
    """
    MobileNetV3Space implements the largest search space in `TuNAS <https://arxiv.org/abs/2008.06120>`__.

    The search dimensions include widths, expand ratios, kernel sizes, SE ratio.
    Some of them can be turned off via arguments to narrow down the search space.

    Different from ProxylessNAS search space, this space is implemented with :class:`nn.ValueChoice`.

    We use the following snippet as reference.
    https://github.com/google-research/google-research/blob/20736344591f774f4b1570af64624ed1e18d2867/tunas/mobile_search_space_v3.py#L728

    Parameters
    ----------
    num_labels : int
        Output size of the final linear classifier.
    base_widths : tuple of int
        Base width of each of the 8 searchable width positions.
    width_multipliers : tuple of float
        Multipliers applied to every base width to form its candidates.
    expand_ratios : tuple of int
        Candidate expand ratios of every inverted residual block.
    dropout_rate : float
        Dropout probability before the classifier.
    bn_eps : float
        Forwarded to ``reset_parameters``.
    bn_momentum : float
        Forwarded to ``reset_parameters``.
    """

    def __init__(self, num_labels: int = 1000,
                 base_widths: Tuple[int, ...] = (16, 16, 32, 64, 128, 256, 512, 1024),
                 width_multipliers: Tuple[float, ...] = (0.5, 0.625, 0.75, 1.0, 1.25, 1.5, 2.0),
                 expand_ratios: Tuple[int, ...] = (1, 2, 3, 4, 5, 6),
                 dropout_rate: float = 0.2,
                 bn_eps: float = 1e-3,
                 bn_momentum: float = 0.1):
        super().__init__()

        # One value choice per width position, labelled ``width_{i}``.
        self.widths = [
            nn.ValueChoice([make_divisible(base_width * mult, 8) for mult in width_multipliers], label=f'width_{i}')
            for i, base_width in enumerate(base_widths)
        ]
        self.expand_ratios = expand_ratios

        blocks = [
            # Stem
            ConvBNReLU(
                3, self.widths[0],
                nn.ValueChoice([3, 5], label='ks_0'),
                stride=2, activation_layer=h_swish
            ),
            SeparableConv(self.widths[0], self.widths[0], activation_layer=nn.ReLU),
        ]

        # counting for kernel sizes and expand ratios
        self.layer_count = 2

        blocks += [
            # Body
            # Each stage gets its own index so that every stage owns a distinct
            # ``depth_{stage_idx}`` label (previously four stages all used 1,
            # which made them share a single depth choice).
            self._make_stage(1, self.widths[0], self.widths[1], False, 2, nn.ReLU),
            self._make_stage(2, self.widths[1], self.widths[2], True, 2, nn.ReLU),
            self._make_stage(3, self.widths[2], self.widths[3], False, 2, h_swish),
            self._make_stage(4, self.widths[3], self.widths[4], True, 1, h_swish),
            self._make_stage(5, self.widths[4], self.widths[5], True, 2, h_swish),
        ]

        # Head
        blocks += [
            ConvBNReLU(self.widths[5], self.widths[6], 1, 1, activation_layer=h_swish),
            nn.AdaptiveAvgPool2d(1),
            ConvBNReLU(self.widths[6], self.widths[7], 1, 1, norm_layer=nn.Identity, activation_layer=h_swish),
        ]

        self.blocks = nn.Sequential(*blocks)

        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(self.widths[7], num_labels),
        )

        reset_parameters(self, bn_momentum=bn_momentum, bn_eps=bn_eps)

    def forward(self, x):
        x = self.blocks(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

    def _make_stage(self, stage_idx, inp, oup, se, stride, act):
        """Build one stage made of up to four inverted residual blocks.

        ``stage_idx`` only names the depth choice of the stage; the expand
        ratio, kernel size and SE choices are named after the running
        ``self.layer_count``.
        """
        # initialize them first because they are related to layer_count.
        exp, ks, se_blocks = [], [], []
        for _ in range(4):
            exp.append(nn.ValueChoice(list(self.expand_ratios), label=f'exp_{self.layer_count}'))
            ks.append(nn.ValueChoice([3, 5, 7], label=f'ks_{self.layer_count}'))
            if se:
                # if SE is true, assign a layer choice to SE.
                # The label is bound as a default argument: the builder is only
                # invoked later (inside InvertedResidual), and reading
                # ``self.layer_count`` at that point would yield the value after
                # the loop, giving all four blocks the same label.
                se_blocks.append(
                    lambda hidden_ch, label=f'se_{self.layer_count}':
                    nn.LayerChoice([nn.Identity(), SELayer(hidden_ch)], label=label)
                )
            else:
                se_blocks.append(None)
            self.layer_count += 1

        blocks = [
            # stride = 2
            InvertedResidual(inp, oup, exp[0], ks[0],
                             stride, squeeze_and_excite=se_blocks[0], activation_layer=act),
            # stride = 1, residual connection should be automatically enabled
            InvertedResidual(oup, oup, exp[1], ks[1], squeeze_and_excite=se_blocks[1], activation_layer=act),
            InvertedResidual(oup, oup, exp[2], ks[2], squeeze_and_excite=se_blocks[2], activation_layer=act),
            InvertedResidual(oup, oup, exp[3], ks[3], squeeze_and_excite=se_blocks[3], activation_layer=act)
        ]

        # mutable depth
        return nn.Repeat(blocks, depth=(1, 4), label=f'depth_{stage_idx}')
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import math
import torch.nn as nn
from nni.retiarii import model_wrapper
from nni.retiarii.nn.pytorch import NasBench101Cell
__all__ = ['NasBench101']
def truncated_normal_(tensor, mean=0, std=1):
    """Fill ``tensor`` in place with samples drawn from four normal candidates per element.

    For every element four standard-normal values are drawn and one of them
    is selected via an arg-max over the "lies strictly inside (-2, 2)" mask;
    the selection is then scaled by ``std`` and shifted by ``mean``.
    Nothing is returned.
    """
    # https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/15
    candidates = tensor.new_empty(tensor.shape + (4,)).normal_()
    in_range = (candidates < 2) & (candidates > -2)
    _, chosen = in_range.max(-1, keepdim=True)
    picked = candidates.gather(-1, chosen).squeeze(-1)
    tensor.data.copy_(picked)
    tensor.data.mul_(std).add_(mean)
class ConvBNReLU(nn.Module):
    """Bias-free convolution followed by batch norm and an in-place ReLU.

    The convolution weights are initialized with ``truncated_normal_`` using
    ``std = sqrt(1 / fan_in)``; batch norm starts at weight 1 and bias 0.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels

        conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=False)
        norm = nn.BatchNorm2d(out_channels)
        self.conv_bn_relu = nn.Sequential(conv, norm, nn.ReLU(inplace=True))
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize every convolution and batch norm contained in this block."""
        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                kernel_h, kernel_w = layer.kernel_size[0], layer.kernel_size[1]
                fan_in = kernel_h * kernel_w * layer.in_channels
                truncated_normal_(layer.weight.data, mean=0., std=math.sqrt(1. / fan_in))
            if isinstance(layer, nn.BatchNorm2d):
                layer.weight.data.fill_(1)
                layer.bias.data.zero_()

    def forward(self, x):
        return self.conv_bn_relu(x)
class Conv3x3BNReLU(ConvBNReLU):
    """``ConvBNReLU`` fixed to a 3x3 kernel with stride 1 and padding 1."""

    def __init__(self, in_channels, out_channels):
        super().__init__(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
class Conv1x1BNReLU(ConvBNReLU):
    """``ConvBNReLU`` fixed to a 1x1 kernel with stride 1 and no padding."""

    def __init__(self, in_channels, out_channels):
        super().__init__(in_channels, out_channels, kernel_size=1, stride=1, padding=0)


# The projection used by the cells is the plain 1x1 conv-bn-relu block.
Projection = Conv1x1BNReLU
@model_wrapper
class NasBench101(nn.Module):
    """The full search space, proposed by `NAS-Bench-101 <http://proceedings.mlr.press/v97/ying19a/ying19a.pdf>`__.

    It's simply a stack of :class:`NasBench101Cell`. Operations are conv3x3, conv1x1 and maxpool respectively.

    Parameters
    ----------
    stem_out_channels : int
        Output channels of the stem convolution.
    num_stacks : int
        Number of stacks; every stack after the first starts with a 2x2
        max-pool and doubles the number of channels.
    num_modules_per_stack : int
        Number of cells in every stack.
    max_num_vertices : int
        Forwarded to :class:`NasBench101Cell`.
    max_num_edges : int
        Forwarded to :class:`NasBench101Cell`.
    num_labels : int
        Output size of the linear classifier.
    bn_eps : float
        ``eps`` written to every ``BatchNorm2d`` in the model.
    bn_momentum : float
        ``momentum`` written to every ``BatchNorm2d`` in the model.
    """

    def __init__(self,
                 stem_out_channels: int = 128,
                 num_stacks: int = 3,
                 num_modules_per_stack: int = 3,
                 max_num_vertices: int = 7,
                 max_num_edges: int = 9,
                 num_labels: int = 10,
                 bn_eps: float = 1e-5,
                 bn_momentum: float = 0.003):
        super().__init__()

        # Kept on the instance so that ``reset_parameters`` can be called
        # again later (it previously read a non-existent ``self.config``).
        self.bn_eps = bn_eps
        self.bn_momentum = bn_momentum

        op_candidates = {
            'conv3x3-bn-relu': lambda num_features: Conv3x3BNReLU(num_features, num_features),
            'conv1x1-bn-relu': lambda num_features: Conv1x1BNReLU(num_features, num_features),
            'maxpool3x3': lambda num_features: nn.MaxPool2d(3, 1, 1)
        }

        # initial stem convolution
        self.stem_conv = Conv3x3BNReLU(3, stem_out_channels)

        layers = []
        in_channels = out_channels = stem_out_channels
        for stack_num in range(num_stacks):
            if stack_num > 0:
                downsample = nn.MaxPool2d(kernel_size=2, stride=2)
                layers.append(downsample)
                out_channels *= 2
            for _ in range(num_modules_per_stack):
                # All cells use the same label 'cell'.
                cell = NasBench101Cell(op_candidates, in_channels, out_channels,
                                       lambda cin, cout: Projection(cin, cout),
                                       max_num_vertices, max_num_edges, label='cell')
                layers.append(cell)
                in_channels = out_channels

        self.features = nn.ModuleList(layers)
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Linear(out_channels, num_labels)

        self.reset_parameters()

    def forward(self, x):
        bs = x.size(0)
        out = self.stem_conv(x)
        for layer in self.features:
            out = layer(out)
        out = self.gap(out).view(bs, -1)
        out = self.classifier(out)
        return out

    def reset_parameters(self):
        """Write the configured ``eps`` and ``momentum`` to every ``BatchNorm2d``."""
        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                module.eps = self.bn_eps
                module.momentum = self.bn_momentum
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch
import torch.nn as nn
from nni.retiarii import model_wrapper
from nni.retiarii.nn.pytorch import NasBench201Cell
__all__ = ['NasBench201']
def _zero_op(C_in, C_out, stride):
    """Build the 'none' operation."""
    return Zero(C_in, C_out, stride)


def _pooling_op(mode):
    """Return a builder of a 3x3 pooling operation of the given mode."""
    def build(C_in, C_out, stride):
        return Pooling(C_in, C_out, stride, mode)
    return build


def _conv_op(kernel, padding):
    """Return a builder of a ReLU-Conv-BN operation with a square kernel."""
    def build(C_in, C_out, stride):
        return ReLUConvBN(C_in, C_out, (kernel, kernel), (stride, stride), (padding, padding), (1, 1))
    return build


def _skip_op(C_in, C_out, stride):
    """Identity when the shape is unchanged, otherwise a factorized reduction."""
    if stride == 1 and C_in == C_out:
        return nn.Identity()
    return FactorizedReduce(C_in, C_out, stride)


# Operation name -> builder taking ``(C_in, C_out, stride)``.
OPS_WITH_STRIDE = {
    'none': _zero_op,
    'avg_pool_3x3': _pooling_op('avg'),
    'max_pool_3x3': _pooling_op('max'),
    'conv_3x3': _conv_op(3, 1),
    'conv_1x1': _conv_op(1, 0),
    'skip_connect': _skip_op,
}

# Names of the operations offered as candidates in every cell.
PRIMITIVES = ['none', 'skip_connect', 'conv_1x1', 'conv_3x3', 'avg_pool_3x3']
class ReLUConvBN(nn.Module):
    """ReLU (not in-place), bias-free convolution and batch norm, applied in that order."""

    def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation):
        super().__init__()
        layers = [
            nn.ReLU(inplace=False),
            nn.Conv2d(C_in, C_out, kernel_size, stride=stride,
                      padding=padding, dilation=dilation, bias=False),
            nn.BatchNorm2d(C_out),
        ]
        self.op = nn.Sequential(*layers)

    def forward(self, x):
        out = self.op(x)
        return out
class SepConv(nn.Module):
    """ReLU, depth-wise convolution, 1x1 point-wise convolution and batch norm."""

    def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation):
        super().__init__()
        depthwise = nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride,
                              padding=padding, dilation=dilation, groups=C_in, bias=False)
        pointwise = nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False)
        self.op = nn.Sequential(
            nn.ReLU(inplace=False),
            depthwise,
            pointwise,
            nn.BatchNorm2d(C_out),
        )

    def forward(self, x):
        out = self.op(x)
        return out
class Pooling(nn.Module):
    """3x3 average or max pooling, preceded by a 1x1 ReLU-Conv-BN when channels change.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'avg'`` nor ``'max'``.
    """

    def __init__(self, C_in, C_out, stride, mode):
        super().__init__()
        # A channel-matching projection is only needed when C_in != C_out.
        self.preprocess = None if C_in == C_out else ReLUConvBN(C_in, C_out, 1, 1, 0, 1)
        if mode == 'avg':
            self.op = nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False)
        elif mode == 'max':
            self.op = nn.MaxPool2d(3, stride=stride, padding=1)
        else:
            raise ValueError('Invalid mode={:} in Pooling'.format(mode))

    def forward(self, x):
        if self.preprocess is not None:
            x = self.preprocess(x)
        return self.op(x)
class Zero(nn.Module):
    """Operation that outputs zeros.

    With equal input and output channels the input (sub-sampled by
    ``stride`` along the last two dimensions when ``stride != 1``) is
    multiplied by zero. Otherwise a new zero tensor is returned whose shape
    equals the input shape with dimension 1 replaced by ``C_out``.
    """

    def __init__(self, C_in, C_out, stride):
        super().__init__()
        self.C_in = C_in
        self.C_out = C_out
        self.stride = stride
        self.is_zero = True

    def forward(self, x):
        if self.C_in != self.C_out:
            target_shape = list(x.shape)
            target_shape[1] = self.C_out
            return x.new_zeros(target_shape, dtype=x.dtype, device=x.device)
        if self.stride == 1:
            return x.mul(0.)
        step = self.stride
        return x[:, :, ::step, ::step].mul(0.)
class FactorizedReduce(nn.Module):
    """Stride-2 reduction built from two 1x1 convolutions whose outputs are concatenated.

    The second convolution sees the input shifted by one pixel (the input is
    zero-padded at the bottom/right and the first row/column is dropped).
    The output channels are split as ``C_out // 2`` and the remainder.

    Raises
    ------
    ValueError
        If ``stride`` is not 2.
    """

    def __init__(self, C_in, C_out, stride):
        super().__init__()
        self.stride = stride
        self.C_in = C_in
        self.C_out = C_out
        self.relu = nn.ReLU(inplace=False)
        if stride != 2:
            raise ValueError('Invalid stride : {:}'.format(stride))
        half = C_out // 2
        self.convs = nn.ModuleList()
        for branch_channels in (half, C_out - half):
            self.convs.append(nn.Conv2d(C_in, branch_channels, 1, stride=stride, padding=0, bias=False))
        self.pad = nn.ConstantPad2d((0, 1, 0, 1), 0)
        self.bn = nn.BatchNorm2d(C_out)

    def forward(self, x):
        activated = self.relu(x)
        shifted = self.pad(activated)[:, :, 1:, 1:]
        first = self.convs[0](activated)
        second = self.convs[1](shifted)
        return self.bn(torch.cat([first, second], dim=1))
class ResNetBasicblock(nn.Module):
    """Residual block of two 3x3 ReLU-Conv-BN layers.

    The shortcut is an average pool followed by a 1x1 convolution when
    ``stride == 2``, a 1x1 ReLU-Conv-BN when only the channel count changes,
    and the identity otherwise.
    """

    def __init__(self, inplanes, planes, stride):
        super().__init__()
        assert stride in (1, 2), 'invalid stride {:}'.format(stride)
        self.conv_a = ReLUConvBN(inplanes, planes, 3, stride, 1, 1)
        self.conv_b = ReLUConvBN(planes, planes, 3, 1, 1, 1)

        shortcut = None
        if stride == 2:
            shortcut = nn.Sequential(
                nn.AvgPool2d(kernel_size=2, stride=2, padding=0),
                nn.Conv2d(inplanes, planes, kernel_size=1, stride=1, padding=0, bias=False))
        elif inplanes != planes:
            shortcut = ReLUConvBN(inplanes, planes, 1, 1, 0, 1)
        self.downsample = shortcut

        self.in_dim = inplanes
        self.out_dim = planes
        self.stride = stride
        self.num_conv = 2

    def forward(self, inputs):
        main = self.conv_b(self.conv_a(inputs))
        # residual
        residual = inputs if self.downsample is None else self.downsample(inputs)
        return residual + main
@model_wrapper
class NasBench201(nn.Module):
    """The full search space proposed by `NAS-Bench-201 <https://arxiv.org/abs/2001.00326>`__.

    It's a stack of :class:`NasBench201Cell`.

    Parameters
    ----------
    stem_out_channels : int
        Output channels of the stem; the two reduction blocks double it each time.
    num_modules_per_stack : int
        Number of cells before, between and after the two reduction blocks.
    num_labels : int
        Output size of the linear classifier.
    """

    def __init__(self,
                 stem_out_channels: int = 16,
                 num_modules_per_stack: int = 5,
                 num_labels: int = 10):
        super().__init__()
        self.channels = C = stem_out_channels
        self.num_modules = N = num_modules_per_stack
        self.num_labels = num_labels

        self.stem = nn.Sequential(
            nn.Conv2d(3, C, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(C)
        )

        layer_channels = [C] * N + [C * 2] + [C * 2] * N + [C * 4] + [C * 4] * N
        layer_reductions = [False] * N + [True] + [False] * N + [True] + [False] * N

        def make_op(prim):
            # A dedicated scope per primitive: a lambda written directly in the
            # dict comprehension would look up ``prim`` only when called, by
            # which time it holds the last entry of PRIMITIVES for every key.
            return lambda C_in, C_out: OPS_WITH_STRIDE[prim](C_in, C_out, 1)

        C_prev = C
        self.cells = nn.ModuleList()
        for C_curr, reduction in zip(layer_channels, layer_reductions):
            if reduction:
                cell = ResNetBasicblock(C_prev, C_curr, 2)
            else:
                # All cells use the same label 'cell'.
                cell = NasBench201Cell({prim: make_op(prim) for prim in PRIMITIVES},
                                       C_prev, C_curr, label='cell')
            self.cells.append(cell)
            C_prev = C_curr

        self.lastact = nn.Sequential(
            nn.BatchNorm2d(C_prev),
            nn.ReLU(inplace=True)
        )
        self.global_pooling = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Linear(C_prev, self.num_labels)

    def forward(self, inputs):
        feature = self.stem(inputs)
        for cell in self.cells:
            feature = cell(feature)

        out = self.lastact(feature)
        out = self.global_pooling(out)
        out = out.view(out.size(0), -1)
        logits = self.classifier(out)

        return logits
This diff is collapsed.
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import math
from typing import Optional, Callable, List, Tuple
import torch
import nni.retiarii.nn.pytorch as nn
from nni.retiarii import model_wrapper
def make_divisible(v, divisor, min_val=None):
    """
    Round ``v`` to a multiple of ``divisor`` that is at least ``min_val``.

    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py

    ``min_val`` defaults to ``divisor``. If the rounded value is below 90%
    of ``v``, one more ``divisor`` is added.
    """
    floor_value = divisor if min_val is None else min_val
    # This should work for both value choices and constants.
    rounded = nn.ValueChoice.max(floor_value, round(v + divisor // 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    bumped = rounded + divisor
    return nn.ValueChoice.condition(rounded < 0.9 * v, bumped, rounded)
class ConvBNReLU(nn.Sequential):
    """
    The template for a conv-bn-relu block.

    The padding is derived from the kernel size and dilation as
    ``(kernel_size - 1) // 2 * dilation``. ``norm_layer`` defaults to
    ``nn.BatchNorm2d`` and ``activation_layer`` to ``nn.ReLU6``; the
    activation factory is called with ``inplace=True``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        groups: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        activation_layer: Optional[Callable[..., nn.Module]] = None,
        dilation: int = 1,
    ) -> None:
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if activation_layer is None:
            activation_layer = nn.ReLU6
        padding = (kernel_size - 1) // 2 * dilation

        conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation=dilation, groups=groups,
                         bias=False)
        norm = norm_layer(out_channels)
        activation = activation_layer(inplace=True)
        super().__init__(conv, norm, activation)
        self.out_channels = out_channels
class SeparableConv(nn.Sequential):
    """
    Depth-wise convolution followed by a linear point-wise convolution.

    In the original MobileNetV2 implementation, this is InvertedResidual when expand ratio = 1.
    Residual connection is added if input and output shape are the same.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        activation_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        # dw
        depthwise = ConvBNReLU(in_channels, in_channels, stride=stride, kernel_size=kernel_size, groups=in_channels,
                               norm_layer=norm_layer, activation_layer=activation_layer)
        # pw-linear
        pointwise = ConvBNReLU(in_channels, out_channels, kernel_size=1, norm_layer=norm_layer,
                               activation_layer=nn.Identity)
        super().__init__(depthwise, pointwise)
        self.residual_connection = stride == 1 and in_channels == out_channels

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = super().forward(x)
        if self.residual_connection:
            return x + out
        return out
class InvertedResidual(nn.Sequential):
    """
    An Inverted Residual Block, sometimes called an MBConv Block, is a type of residual block used for image models
    that uses an inverted structure for efficiency reasons.

    It was originally proposed for the `MobileNetV2 <https://arxiv.org/abs/1801.04381>`__ CNN architecture.
    It has since been reused for several mobile-optimized CNNs.
    It follows a narrow -> wide -> narrow approach, hence the inversion.
    It first widens with a 1x1 convolution, then uses a depthwise convolution (which greatly reduces the number of parameters),
    then a 1x1 convolution is used to reduce the number of channels so input and output can be added.

    Follow implementation of:
    https://github.com/google-research/google-research/blob/20736344591f774f4b1570af64624ed1e18d2867/tunas/rematlib/mobile_model_v3.py#L453

    Parameters
    ----------
    in_channels : int
        Channels of the input.
    out_channels : int
        Channels of the output.
    expand_ratio : int
        The hidden width is ``round(in_channels * expand_ratio)``.
    kernel_size : int
        Kernel size of the depth-wise convolution.
    stride : int
        Stride of the depth-wise convolution; must be 1 or 2.
    squeeze_and_excite : callable, optional
        If given, it is called with the hidden width and the returned module
        is inserted after the depth-wise convolution.
    norm_layer : callable, optional
        Forwarded to every ``ConvBNReLU``.
    activation_layer : callable, optional
        Forwarded to the first two ``ConvBNReLU``; the last one is linear.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        expand_ratio: int,
        kernel_size: int = 3,
        stride: int = 1,
        squeeze_and_excite: Optional[Callable[[int], nn.Module]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        activation_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        # NOTE(review): the base initializer is invoked here without layers and
        # once more below with the layers. Confirm that the attributes assigned
        # in between are still available afterwards for every argument type
        # that callers pass (e.g. value choices).
        super().__init__()
        self.stride = stride
        self.out_channels = out_channels
        assert stride in [1, 2]

        hidden_ch = nn.ValueChoice.to_int(round(in_channels * expand_ratio))

        # FIXME: check whether this equal works
        # Residual connection is added here stride = 1 and input channels and output channels are the same.
        self.residual_connection = stride == 1 and in_channels == out_channels

        layers: List[nn.Module] = [
            # point-wise convolution
            # NOTE: some paper omit this point-wise convolution when stride = 1.
            # In our implementation, if this pw convolution is intended to be omitted,
            # please use SeparableConv instead.
            ConvBNReLU(in_channels, hidden_ch, kernel_size=1,
                       norm_layer=norm_layer, activation_layer=activation_layer),
            # depth-wise
            ConvBNReLU(hidden_ch, hidden_ch, stride=stride, kernel_size=kernel_size, groups=hidden_ch,
                       norm_layer=norm_layer, activation_layer=activation_layer)
        ]

        # Optional squeeze-and-excite module, built for the hidden width.
        if squeeze_and_excite:
            layers.append(squeeze_and_excite(hidden_ch))

        layers += [
            # pw-linear
            ConvBNReLU(hidden_ch, out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=nn.Identity)
        ]

        super().__init__(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Add the input back only when the residual connection is enabled.
        if self.residual_connection:
            return x + super().forward(x)
        else:
            return super().forward(x)
def inverted_residual_choice_builder(
    expand_ratios: List[int],
    kernel_sizes: List[int],
    downsample: bool,
    stage_input_width: int,
    stage_output_width: int,
    label: str
):
    """Return a function that builds the layer choice of the ``index``-th block of a stage.

    The block at index 0 maps ``stage_input_width`` to ``stage_output_width``
    and uses stride 2 when ``downsample`` is set; every other block keeps
    ``stage_output_width`` with stride 1. The candidates are all combinations
    of ``expand_ratios`` and ``kernel_sizes``, keyed ``k{kernel_size}e{exp_ratio}``.
    """

    def builder(index):
        is_first = index == 0
        # first layer in stage
        # do downsample and width reshape
        inp = stage_input_width if is_first else stage_output_width
        stride = 2 if (is_first and downsample) else 1
        oup = stage_output_width

        op_choices = {
            f'k{kernel_size}e{exp_ratio}': InvertedResidual(inp, oup, exp_ratio, kernel_size, stride)
            for exp_ratio in expand_ratios
            for kernel_size in kernel_sizes
        }

        # It can be implemented with ValueChoice, but we use LayerChoice here
        # to be aligned with the intention of the original ProxylessNAS.
        return nn.LayerChoice(op_choices, label=f'{label}_i{index}')

    return builder
@model_wrapper
class ProxylessNAS(nn.Module):
    """
    The search space proposed by `ProxylessNAS <https://arxiv.org/abs/1812.00332>`__.

    Following the official implementation, the inverted residual with kernel size / expand ratio variations in each layer
    is implemented with a :class:`nn.LayerChoice` with all-combination candidates. That means,
    when used in weight sharing, these candidates will be treated as separate layers, and won't be fine-grained shared.
    We note that ``MobileNetV3Space`` is different in this perspective.

    Parameters
    ----------
    num_labels : int
        Output size of the linear classifier.
    base_widths : tuple of int
        Nine widths: first conv, the seven block stages and the feature-mix layer.
    dropout_rate : float
        Dropout probability before the classifier.
    width_mult : float
        Multiplier applied to every base width (then made divisible by 8).
    bn_eps : float
        Forwarded to ``reset_parameters``.
    bn_momentum : float
        Forwarded to ``reset_parameters``.
    """

    def __init__(self, num_labels: int = 1000,
                 base_widths: Tuple[int, ...] = (32, 16, 32, 40, 80, 96, 192, 320, 1280),
                 dropout_rate: float = 0.,
                 width_mult: float = 1.0,
                 bn_eps: float = 1e-3,
                 bn_momentum: float = 0.1):

        super().__init__()

        assert len(base_widths) == 9
        # include the last stage info widths here
        widths = [make_divisible(width * width_mult, 8) for width in base_widths]
        downsamples = [True, False, True, True, True, False, True, False]

        self.num_labels = num_labels
        self.dropout_rate = dropout_rate
        self.bn_eps = bn_eps
        self.bn_momentum = bn_momentum

        self.first_conv = ConvBNReLU(3, widths[0], stride=2, norm_layer=nn.BatchNorm2d)

        blocks = [
            # first stage is fixed
            SeparableConv(widths[0], widths[1], kernel_size=3, stride=1)
        ]

        # https://github.com/ultmaster/AceNAS/blob/46c8895fd8a05ffbc61a6b44f1e813f64b4f66b7/searchspace/proxylessnas/__init__.py#L21
        for stage in range(2, 8):
            # Rather than returning a fixed module here,
            # we return a builder that dynamically creates module for different `repeat_idx`.
            builder = inverted_residual_choice_builder(
                [3, 6], [3, 5, 7], downsamples[stage], widths[stage - 1], widths[stage], f's{stage}')
            # Stages 2-6 have a searchable depth; only stage 7, the last one,
            # is a single block. (The bound used to be 6, which also fixed the
            # depth of stage 6.)
            if stage < 7:
                blocks.append(nn.Repeat(builder, (1, 4), label=f's{stage}_depth'))
            else:
                # No mutation for depth in the last stage.
                # Directly call builder to initiate one block
                blocks.append(builder(0))

        self.blocks = nn.Sequential(*blocks)

        # final layers
        self.feature_mix_layer = ConvBNReLU(widths[7], widths[8], kernel_size=1, norm_layer=nn.BatchNorm2d)
        self.global_avg_pooling = nn.AdaptiveAvgPool2d(1)
        self.dropout_layer = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(widths[-1], num_labels)

        reset_parameters(self, bn_momentum=bn_momentum, bn_eps=bn_eps)

    def forward(self, x):
        x = self.first_conv(x)
        x = self.blocks(x)
        x = self.feature_mix_layer(x)
        x = self.global_avg_pooling(x)
        x = x.view(x.size(0), -1)  # flatten
        x = self.dropout_layer(x)
        x = self.classifier(x)
        return x

    def no_weight_decay(self):
        """Return the names of the parameters that should not be weight-decayed."""
        # this is useful for timm optimizer
        # no regularizer to linear layer
        if hasattr(self, 'classifier'):
            return {'classifier.weight', 'classifier.bias'}
        return set()
def reset_parameters(model, model_init='he_fout', init_div_groups=False,
                     bn_momentum=0.1, bn_eps=1e-5):
    """Re-initialize the convolutions, batch norms and linear layers of ``model`` in place.

    * ``Conv2d``: normal with ``std = sqrt(2 / n)`` where ``n`` is the kernel
      area times the output channels (``'he_fout'``) or the input channels
      (``'he_fin'``), optionally divided by the number of groups.
    * ``BatchNorm2d``: weight 1, bias 0, and ``momentum`` / ``eps`` overwritten.
    * ``Linear``: weight normal with std 0.01, bias (if any) 0.
    * ``BatchNorm1d``: weight 1, bias 0.

    Raises
    ------
    NotImplementedError
        If a ``Conv2d`` is met and ``model_init`` is not one of the two modes.
    """
    for module in model.modules():
        if isinstance(module, nn.Conv2d):
            if model_init == 'he_fout':
                fan_channels = module.out_channels
            elif model_init == 'he_fin':
                fan_channels = module.in_channels
            else:
                raise NotImplementedError
            n = module.kernel_size[0] * module.kernel_size[1] * fan_channels
            if init_div_groups:
                n /= module.groups
            module.weight.data.normal_(0, math.sqrt(2. / n))
        elif isinstance(module, nn.BatchNorm2d):
            module.weight.data.fill_(1)
            module.bias.data.zero_()
            module.momentum = bn_momentum
            module.eps = bn_eps
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(0, 0.01)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.BatchNorm1d):
            module.weight.data.fill_(1)
            module.bias.data.zero_()
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch
import nni.retiarii.nn.pytorch as nn
from nni.retiarii import model_wrapper
class ShuffleNetBlock(nn.Module):
    """
    Describe the basic building block of shuffle net, as described in
    `ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices <https://arxiv.org/pdf/1707.01083.pdf>`__.

    When stride = 1, the block expects an input with ``2 * input channels``. Otherwise input channels.

    ``sequence`` is a string of ``"p"`` (point-wise conv) and ``"d"``
    (depth-wise conv) tokens describing the main branch; it must end with ``"p"``.
    """

    def __init__(self, in_channels: int, out_channels: int, mid_channels: int, *,
                 kernel_size: int, stride: int, sequence: str = "pdp", affine: bool = True):
        super().__init__()
        assert stride in [1, 2]
        assert kernel_size in [3, 5, 7]

        # With stride 1 the input is split in two halves by the channel shuffle.
        self.channels = in_channels // 2 if stride == 1 else in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.mid_channels = mid_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.pad = kernel_size // 2
        self.oup_main = out_channels - self.channels
        self.affine = affine
        assert self.oup_main > 0

        main_layers = self._decode_point_depth_conv(sequence)
        self.branch_main = nn.Sequential(*main_layers)

        if stride != 2:
            # empty block to be compatible with torchscript
            self.branch_proj = nn.Sequential()
        else:
            width = self.channels
            self.branch_proj = nn.Sequential(
                # dw
                nn.Conv2d(width, width, kernel_size, stride, self.pad,
                          groups=width, bias=False),
                nn.BatchNorm2d(width, affine=affine),
                # pw-linear
                nn.Conv2d(width, width, 1, 1, 0, bias=False),
                nn.BatchNorm2d(width, affine=affine),
                nn.ReLU(inplace=True)
            )

    def forward(self, x):
        if self.stride == 2:
            shortcut, main_input = self.branch_proj(x), x
        else:
            shortcut, main_input = self._channel_shuffle(x)
        return torch.cat((shortcut, self.branch_main(main_input)), 1)

    def _decode_point_depth_conv(self, sequence):
        """Translate the token string into the list of modules of the main branch."""
        layers = []
        seen_depth = seen_point = False
        prev_width = width = self.channels
        last_index = len(sequence) - 1
        for position, token in enumerate(sequence):
            # compute output channels of this conv
            if position == last_index:
                assert token == "p", "Last conv must be point-wise conv."
                width = self.oup_main
            elif token == "p" and not seen_point:
                width = self.mid_channels

            if token == "d":
                # depth-wise conv
                if isinstance(prev_width, int) and isinstance(width, int):
                    # check can only be done for static channels
                    assert prev_width == width, "Depth-wise conv must not change channels."
                conv_stride = 1 if seen_depth else self.stride
                layers.append(nn.Conv2d(prev_width, width, self.kernel_size, conv_stride, self.pad,
                                        groups=width, bias=False))
                layers.append(nn.BatchNorm2d(width, affine=self.affine))
                seen_depth = True
            elif token == "p":
                # point-wise conv
                layers.append(nn.Conv2d(prev_width, width, 1, 1, 0, bias=False))
                layers.append(nn.BatchNorm2d(width, affine=self.affine))
                layers.append(nn.ReLU(inplace=True))
                seen_point = True
            else:
                raise ValueError("Conv sequence must be d and p.")
            prev_width = width
        return layers

    def _channel_shuffle(self, x):
        """Split ``x`` into its even-indexed and odd-indexed channels."""
        bs, num_channels, height, width = x.size()
        # NOTE: this line is commented for torchscript
        # assert (num_channels % 4 == 0)
        pairs = x.reshape(bs * num_channels // 2, 2, height * width)
        swapped = pairs.permute(1, 0, 2)
        halves = swapped.reshape(2, -1, num_channels // 2, height, width)
        return halves[0], halves[1]
class ShuffleXceptionBlock(ShuffleNetBlock):
    """
    The ``choice_x`` version of shuffle net block, described in
    `Single Path One-shot <https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123610528.pdf>`__.

    It is a :class:`ShuffleNetBlock` with kernel size 3 and the conv sequence ``"dpdpdp"``.
    """

    def __init__(self, in_channels: int, out_channels: int, mid_channels: int, *, stride: int, affine: bool = True):
        block_config = dict(kernel_size=3, stride=stride, sequence="dpdpdp", affine=affine)
        super().__init__(in_channels, out_channels, mid_channels, **block_config)
@model_wrapper
class ShuffleNetSpace(nn.Module):
"""
The search space proposed in `Single Path One-shot <https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123610528.pdf>`__.
The basic building block design is inspired by a state-of-the-art manually-designed network --
`ShuffleNetV2 <https://openaccess.thecvf.com/content_ECCV_2018/html/Ningning_Light-weight_CNN_Architecture_ECCV_2018_paper.html>`__.
There are 20 choice blocks in total. Each choice block has 4 candidates, namely ``choice 3``, ``choice 5``,
``choice_7`` and ``choice_x`` respectively. They differ in kernel sizes and the number of depthwise convolutions.
The size of the search space is :math:`4^{20}`.
Parameters
----------
num_labels : int
Number of classes for the classification head. Default: 1000.
channel_search : bool
If true, for each building block, the number of ``mid_channels``
(output channels of the first 1x1 conv in each building block) varies from 0.2x to 1.6x (quantized to multiple of 0.2).
Here, "k-x" means k times the number of default channels.
Otherwise, 1.0x is used by default. Default: false.
affine : bool
Apply affine to all batch norm. Default: false.
"""
def __init__(self,
num_labels: int = 1000,
channel_search: bool = False,
affine: bool = False):
super().__init__()
self.num_labels = num_labels
self.channel_search = channel_search
self.affine = affine
# the block number in each stage. 4 stages in total. 20 blocks in total.
self.stage_repeats = [4, 4, 8, 4]
# output channels for all stages, including the very first layer and the very last layer
self.stage_out_channels = [-1, 16, 64, 160, 320, 640, 1024]
# building first layer
out_channels = self.stage_out_channels[1]
self.first_conv = nn.Sequential(
nn.Conv2d(3, out_channels, 3, 2, 1, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
)
self.features = []
global_block_idx = 0
for stage_idx, num_repeat in enumerate(self.stage_repeats):
for block_idx in range(num_repeat):
# count global index to give names to choices
global_block_idx += 1
# get ready for input and output
in_channels = out_channels
out_channels = self.stage_out_channels[stage_idx + 2]
stride = 2 if block_idx == 0 else 1
# mid channels can be searched
base_mid_channels = out_channels // 2
if self.channel_search:
k_choice_list = [int(base_mid_channels * (.2 * k)) for k in range(1, 9)]
mid_channels = nn.ValueChoice(k_choice_list, label=f'channel_{global_block_idx}')
else:
mid_channels = int(base_mid_channels)
choice_block = nn.LayerChoice([
ShuffleNetBlock(in_channels, out_channels, mid_channels=mid_channels, kernel_size=3, stride=stride, affine=affine),
ShuffleNetBlock(in_channels, out_channels, mid_channels=mid_channels, kernel_size=5, stride=stride, affine=affine),
ShuffleNetBlock(in_channels, out_channels, mid_channels=mid_channels, kernel_size=7, stride=stride, affine=affine),
ShuffleXceptionBlock(in_channels, out_channels, mid_channels=mid_channels, stride=stride, affine=affine)
], label=f'layer_{global_block_idx}')
self.features.append(choice_block)
self.features = nn.Sequential(*self.features)
# final layers
last_conv_channels = self.stage_out_channels[-1]
self.conv_last = nn.Sequential(
nn.Conv2d(out_channels, last_conv_channels, 1, 1, 0, bias=False),
nn.BatchNorm2d(last_conv_channels, affine=affine),
nn.ReLU(inplace=True),
)
self.globalpool = nn.AdaptiveAvgPool2d((1, 1))
self.dropout = nn.Dropout(0.1)
self.classifier = nn.Sequential(
nn.Linear(last_conv_channels, num_labels, bias=False),
)
self._initialize_weights()
def forward(self, x):
    """Run the network on ``x`` and return the classifier output.

    The input goes through ``first_conv``, ``features`` and ``conv_last``,
    is pooled by ``globalpool``, passed through ``dropout``, reshaped to rows of
    ``self.stage_out_channels[-1]`` values, and finally fed to ``classifier``.
    """
    feature_map = self.conv_last(self.features(self.first_conv(x)))
    pooled = self.dropout(self.globalpool(feature_map))
    # one row per sample, as wide as the last entry of ``stage_out_channels``
    flattened = pooled.contiguous().view(-1, self.stage_out_channels[-1])
    return self.classifier(flattened)
def _initialize_weights(self):
    """Initialize convolution, batch-norm and linear sub-modules in place.

    * ``Conv2d``: weights drawn from a zero-mean normal distribution. The standard deviation
      is 0.01 when the module name contains ``'first'`` and ``1 / weight.shape[1]`` otherwise.
      A bias, if present, is set to 0.
    * ``BatchNorm2d`` / ``BatchNorm1d``: weight set to 1 and bias to 0.0001 when they exist,
      running mean set to 0.
    * ``Linear``: weights drawn from a zero-mean normal distribution with std 0.01.
      A bias, if present, is set to 0.
    """
    for name, module in self.named_modules():
        if isinstance(module, nn.Conv2d):
            std = 0.01 if 'first' in name else 1.0 / module.weight.shape[1]
            torch.nn.init.normal_(module.weight, 0, std)
            if module.bias is not None:
                torch.nn.init.constant_(module.bias, 0)
        elif isinstance(module, (nn.BatchNorm2d, nn.BatchNorm1d)):
            # weight / bias are None for non-affine batch norm
            if module.weight is not None:
                torch.nn.init.constant_(module.weight, 1)
            if module.bias is not None:
                torch.nn.init.constant_(module.bias, 0.0001)
            torch.nn.init.constant_(module.running_mean, 0)
        elif isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, 0, 0.01)
            if module.bias is not None:
                torch.nn.init.constant_(module.bias, 0)
......@@ -388,7 +388,7 @@ def _valuechoice_staticmethod_helper(orig_func):
return orig_func
class ValueChoiceX(Translatable):
class ValueChoiceX(Translatable, nn.Module):
"""Internal API. Implementation note:
The transformed (X) version of value choice.
......@@ -404,6 +404,8 @@ class ValueChoiceX(Translatable):
2. For graph-engine, it uses evaluate to calculate the result.
Potentially, we have to implement the evaluation logic in oneshot algorithms. I believe we can postpone the discussion till then.
This class is implemented as a ``nn.Module`` so that it can be scanned by python engine / torchscript.
"""
def __init__(self, function: Callable[..., Any], repr_template: str, arguments: List[Any], dry_run: bool = True):
......@@ -424,6 +426,9 @@ class ValueChoiceX(Translatable):
# for sanity check
self.dry_run()
def forward(self) -> None:
    """Always raise: a composed value choice is not meant to be called as a module."""
    message = 'You should never call forward of the composition of a value-choice.'
    raise RuntimeError(message)
def inner_choices(self) -> Iterable['ValueChoice']:
"""
Return an iterable of all leaf value choices.
......
......@@ -93,6 +93,8 @@ class Repeat(Mutable):
depth: Union[int, Tuple[int, int]], *, label: Optional[str] = None):
super().__init__()
self._label = None # by default, no label
if isinstance(depth, ValueChoiceX):
if label is not None:
warnings.warn(
......@@ -103,10 +105,16 @@ class Repeat(Mutable):
all_values = list(self.depth_choice.all_options())
self.min_depth = min(all_values)
self.max_depth = max(all_values)
if isinstance(depth, ValueChoice):
self._label = depth.label # if a leaf node
elif isinstance(depth, tuple):
self.min_depth = depth if isinstance(depth, int) else depth[0]
self.max_depth = depth if isinstance(depth, int) else depth[1]
self.depth_choice = ValueChoice(list(range(self.min_depth, self.max_depth + 1)), label=label)
self._label = self.depth_choice.label
elif isinstance(depth, int):
self.min_depth = self.max_depth = depth
self.depth_choice = depth
......@@ -116,8 +124,8 @@ class Repeat(Mutable):
self.blocks = nn.ModuleList(self._replicate_and_instantiate(blocks, self.max_depth))
@property
def label(self):
return self.depth_choice.label
def label(self) -> Optional[str]:
return self._label
def forward(self, x):
for block in self.blocks:
......@@ -142,6 +150,9 @@ class Repeat(Mutable):
# shortcut for blocks[index]
return self.blocks[index]
def __len__(self):
    """Return ``self.max_depth``, the upper bound of the repeat depth."""
    upper_bound = self.max_depth
    return upper_bound
class NasBench201Cell(nn.Module):
"""
......
......@@ -311,9 +311,10 @@ def extract_mutation_from_pt_module(pytorch_model: nn.Module) -> Tuple[Model, Op
node = graph.add_node(name, 'InputChoice',
{'n_candidates': module.n_candidates, 'n_chosen': module.n_chosen})
node.label = module.label
if isinstance(module, ValueChoice):
node = graph.add_node(name, 'ValueChoice', {'candidates': module.candidates})
node.label = module.label
if isinstance(module, ValueChoiceX):
for i, choice in enumerate(module.inner_choices()):
node = graph.add_node(f'{name}.{i}', 'ValueChoice', {'candidates': choice.candidates})
node.label = choice.label
if isinstance(module, NasBench101Cell):
node = graph.add_node(name, 'NasBench101Cell', {
'max_num_edges': module.max_num_edges
......
......@@ -683,6 +683,27 @@ class GraphIR(unittest.TestCase):
new_model = _apply_all_mutators(model, mutators, samplers)
self.assertTrue((self._get_converted_pytorch_model(new_model)(torch.zeros(1, 16)) == target).all())
def test_repeat_valuechoicex(self):
    """Repeat whose depth is a value-choice expression (``ValueChoice([0, 2, 4]) + 1``).

    Each repeated block adds one to an all-zero input, so the output value equals the
    sampled depth; the enumerated depths are expected to be 1, 3 and 5.
    """
    class AddOne(nn.Module):
        def forward(self, x):
            return x + 1

    @model_wrapper
    class Net(nn.Module):
        def __init__(self):
            super().__init__()
            depth = nn.ValueChoice([0, 2, 4]) + 1
            self.block = nn.Repeat(AddOne(), depth)

        def forward(self, x):
            return self.block(x)

    base_model, mutators = self._get_model_with_mutators(Net())
    expected_mutators = 1 + self.repeat_incr + self.value_choice_incr
    self.assertEqual(len(mutators), expected_mutators)

    samplers = [EnumerateSampler() for _ in range(len(mutators))]
    for expected_depth in [1, 3, 5]:
        mutated = _apply_all_mutators(base_model, mutators, samplers)
        converted = self._get_converted_pytorch_model(mutated)
        output = converted(torch.zeros(1, 16))
        self.assertTrue((output == expected_depth).all())
def test_repeat_weight_inheritance(self):
@model_wrapper
class Net(nn.Module):
......
"""Currently, this is only a sanity-check (runnable) of spaces provided in hub."""
import random
from torchvision import transforms
from torchvision.datasets import FakeData
import pytest
import pytorch_lightning
import nni
import nni.runtime.platform.test
import nni.retiarii.evaluator.pytorch.lightning as pl
import nni.retiarii.hub.pytorch as searchspace
from nni.retiarii.utils import ContextStack
from nni.retiarii.execution.utils import _unpack_if_only_one
from nni.retiarii.mutator import InvalidMutation, Sampler
from nni.retiarii.nn.pytorch.mutator import extract_mutation_from_pt_module
# Skip every test in this module when the installed pytorch-lightning reports a version below '1.0'.
# NOTE(review): presumably `__version__` is a string, in which case this is a lexicographic
# comparison rather than a numeric one -- confirm it is adequate for the versions in use.
pytestmark = pytest.mark.skipif(pytorch_lightning.__version__ < '1.0', reason='Incompatible APIs.')
def _reset():
    """Put the trial / test-platform module globals back to fixed values.

    Called around each search-space run so that these tests do not affect other tests in sdk.
    """
    nni.runtime.platform.test._last_metric = None
    nni.trial._params = {'foo': 'bar', 'parameter_id': 0}
    nni.trial._intermediate_seq = 0
class RandomSampler(Sampler):
    """Sampler that answers every choice with a random candidate and counts the requests."""

    def __init__(self):
        # number of times ``choice`` has been called
        self.counter = 0

    def choice(self, candidates, *args, **kwargs):
        """Count the call, then return one element of ``candidates`` picked by ``random.choice``.

        Extra positional and keyword arguments are accepted and ignored.
        """
        self.counter = self.counter + 1
        picked = random.choice(candidates)
        return picked
def try_mutation_until_success(base_model, mutators, retry):
    """Apply all mutators with fresh random samplers, retrying when a mutation is invalid.

    Parameters
    ----------
    base_model
        Model every attempt starts from.
    mutators
        Mutators applied in order; each one is bound to a new ``RandomSampler`` per attempt.
    retry
        Maximum number of attempts.

    Returns
    -------
    The model produced by the first attempt in which no mutator raised ``InvalidMutation``.

    Raises
    ------
    ValueError
        If ``retry`` is zero or negative, or if every attempt raised ``InvalidMutation``
        (the last such error is attached as the cause).
    """
    last_error = None
    # A loop instead of recursion: bounded for any ``retry`` and no stack growth per attempt.
    for _ in range(retry):
        try:
            model = base_model
            for mutator in mutators:
                model = mutator.bind_sampler(RandomSampler()).apply(model)
            return model
        except InvalidMutation as error:
            last_error = error
    raise ValueError('Retry exhausted.') from last_error
def _test_searchspace_on_dataset(searchspace, dataset='cifar10', arch=None):
    """Sample (or fix) one architecture from a search space and train it briefly on fake data.

    Parameters
    ----------
    searchspace
        The search-space module handed to ``extract_mutation_from_pt_module``.
    dataset
        ``'cifar10'`` (3x32x32 images, 10 classes) or ``'imagenet'`` (3x224x224 images,
        1000 classes). Only decides the shape of the generated ``FakeData``.
    arch
        Architecture to instantiate, passed to ``ContextStack('fixed', arch)``.
        When ``None``, one is sampled randomly (up to 10 attempts).

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names, or sampling ran out of retries.
    """
    # (image_size, num_classes) of the fake data standing in for each supported dataset
    fake_data_specs = {
        'cifar10': ((3, 32, 32), 10),
        'imagenet': ((3, 224, 224), 1000),
    }
    if dataset not in fake_data_specs:
        raise ValueError(f'Unsupported dataset: {dataset!r}. Expected one of {sorted(fake_data_specs)}.')
    image_size, num_classes = fake_data_specs[dataset]

    _reset()
    try:
        model, mutators = extract_mutation_from_pt_module(searchspace)

        if arch is None:
            model = try_mutation_until_success(model, mutators, 10)
            arch = {mut.mutator.label: _unpack_if_only_one(mut.samples) for mut in model.history}

        print('Selected model:', arch)
        with ContextStack('fixed', arch):
            model = model.python_class(**model.python_init_params)

        train_data = FakeData(size=200, image_size=image_size, num_classes=num_classes, transform=transforms.ToTensor())
        valid_data = FakeData(size=200, image_size=image_size, num_classes=num_classes, transform=transforms.ToTensor())

        train_dataloader = pl.DataLoader(train_data, batch_size=4, shuffle=True)
        valid_dataloader = pl.DataLoader(valid_data, batch_size=6)

        evaluator = pl.Classification(
            train_dataloader=train_dataloader,
            val_dataloaders=valid_dataloader,
            export_onnx=False,
            max_epochs=1,
            limit_train_batches=2,
            limit_val_batches=3,
        )
        evaluator.fit(model)
    finally:
        # cleanup to avoid affecting later test cases, even when the run above fails
        _reset()
def test_nasbench101():
    """Sanity-check that the NasBench101 space can be sampled and trained on fake data."""
    _test_searchspace_on_dataset(searchspace.NasBench101())
def test_nasbench201():
    """Sanity-check that the NasBench201 space can be sampled and trained on fake data."""
    # Previously this instantiated NasBench101, so NasBench201 was never exercised.
    ss = searchspace.NasBench201()
    _test_searchspace_on_dataset(ss)
def test_nasnet():
    """Sanity-check NASNet, ENAS, AmoebaNet, PNAS and DARTS with default arguments, in that order."""
    for space_name in ('NASNet', 'ENAS', 'AmoebaNet', 'PNAS', 'DARTS'):
        space = getattr(searchspace, space_name)()
        _test_searchspace_on_dataset(space)
def test_nasnet_corner_case():
    """Run NASNet with a fixed architecture that previously exposed a corner case.

    The case is that output channel of reduce cell and normal cell are different;
    CellPreprocessor needs to know whether its predecessors are normal cell / reduction cell.
    """
    normal_cell = {
        'normal/op_2_0': 'max_pool_7x7',
        'normal/op_2_1': 'conv_1x1',
        'normal/op_3_0': 'sep_conv_5x5',
        'normal/op_3_1': 'max_pool_7x7',
        'normal/op_4_0': 'sep_conv_5x5',
        'normal/op_4_1': 'conv_1x1',
        'normal/op_5_0': 'max_pool_3x3',
        'normal/op_5_1': 'sep_conv_5x5',
        'normal/op_6_0': 'max_pool_7x7',
        'normal/op_6_1': 'sep_conv_5x5',
        'normal/input_2_0': 0,
        'normal/input_2_1': 0,
        'normal/input_3_0': 0,
        'normal/input_3_1': 1,
        'normal/input_4_0': 1,
        'normal/input_4_1': 2,
        'normal/input_5_0': 0,
        'normal/input_5_1': 1,
        'normal/input_6_0': 0,
        'normal/input_6_1': 2,
    }
    reduce_cell = {
        'reduce/op_2_0': 'dil_conv_3x3',
        'reduce/op_2_1': 'max_pool_7x7',
        'reduce/op_3_0': 'dil_conv_3x3',
        'reduce/op_3_1': 'dil_conv_3x3',
        'reduce/op_4_0': 'conv_7x1_1x7',
        'reduce/op_4_1': 'conv_7x1_1x7',
        'reduce/op_5_0': 'max_pool_3x3',
        'reduce/op_5_1': 'conv_1x1',
        'reduce/op_6_0': 'sep_conv_7x7',
        'reduce/op_6_1': 'sep_conv_3x3',
        'reduce/input_2_0': 1,
        'reduce/input_2_1': 1,
        'reduce/input_3_0': 0,
        'reduce/input_3_1': 1,
        'reduce/input_4_0': 2,
        'reduce/input_4_1': 1,
        'reduce/input_5_0': 0,
        'reduce/input_5_1': 4,
        'reduce/input_6_0': 3,
        'reduce/input_6_1': 3,
    }
    # same keys, values and insertion order as one flat literal
    fixed_arch = {'width': 32, 'depth': 8, **normal_cell, **reduce_cell}
    _test_searchspace_on_dataset(searchspace.NASNet(), arch=fixed_arch)
def test_nasnet_fixwd():
    """Sanity-check cell-based spaces built with explicit ``width`` and ``num_cells``."""
    # minimum
    _test_searchspace_on_dataset(searchspace.DARTS(width=16, num_cells=4))

    # medium
    _test_searchspace_on_dataset(searchspace.NASNet(width=16, num_cells=12))
def test_nasnet_imagenet():
    """Sanity-check ENAS and PNAS, built with ``dataset='imagenet'``, on imagenet-sized fake data."""
    for space_name in ('ENAS', 'PNAS'):
        space = getattr(searchspace, space_name)(dataset='imagenet')
        _test_searchspace_on_dataset(space, dataset='imagenet')
def test_proxylessnas():
    """Sanity-check the ProxylessNAS space on imagenet-sized fake data."""
    _test_searchspace_on_dataset(searchspace.ProxylessNAS(), dataset='imagenet')
def test_mobilenetv3():
    """Sanity-check the MobileNetV3 space on imagenet-sized fake data."""
    _test_searchspace_on_dataset(searchspace.MobileNetV3Space(), dataset='imagenet')
def test_shufflenet():
    """Sanity-check the ShuffleNet space, first with default arguments and then with channel search."""
    for init_kwargs in ({}, {'channel_search': True}):
        space = searchspace.ShuffleNetSpace(**init_kwargs)
        _test_searchspace_on_dataset(space, dataset='imagenet')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment