mobilenet_v3.py

# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Mobilenet V3 conv defs and helper functions."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import copy
import functools
import numpy as np

import tensorflow as tf
from tensorflow.contrib import slim as contrib_slim

from nets.mobilenet import conv_blocks as ops
from nets.mobilenet import mobilenet as lib

slim = contrib_slim
op = lib.op
expand_input = ops.expand_input_by_factor

# Squeeze Excite with all parameters filled-in, we use hard-sigmoid
# for gating function and relu for inner activation function.
squeeze_excite = functools.partial(
    ops.squeeze_excite, squeeze_factor=4,
    inner_activation_fn=tf.nn.relu,
    gating_fn=lambda x: tf.nn.relu6(x+3)*0.16667)

# Wrap squeeze excite op as expansion_transform that takes
# both expansion and input tensor.
_se4 = lambda expansion_tensor, input_tensor: squeeze_excite(expansion_tensor)


def hard_swish(x):
  with tf.compat.v1.name_scope('hard_swish'):
    return x * tf.nn.relu6(x + np.float32(3)) * np.float32(1. / 6.)


def reduce_to_1x1(input_tensor, default_size=7, **kwargs):
  h, w = input_tensor.shape.as_list()[1:3]
  if h is not None and w == h:
    k = [h, h]
  else:
    k = [default_size, default_size]
  return slim.avg_pool2d(input_tensor, kernel_size=k, **kwargs)


def mbv3_op(ef, n, k, s=1, act=tf.nn.relu, se=None, **kwargs):
  """Defines a single Mobilenet V3 convolution block.

  Args:
    ef: expansion factor
    n: number of output channels
    k: stride of depthwise
    s: stride
    act: activation function in inner layers
    se: squeeze excite function.
    **kwargs: passed to expanded_conv

  Returns:
    An object (lib._Op) for inserting in conv_def, representing this operation.
  """
  return op(
      ops.expanded_conv,
      expansion_size=expand_input(ef),
      kernel_size=(k, k),
      stride=s,
      num_outputs=n,
      inner_activation_fn=act,
      expansion_transform=se,
      **kwargs)


def mbv3_fused(ef, n, k, s=1, **kwargs):
  """Defines a single Mobilenet V3 convolution block.

  Args:
    ef: expansion factor
    n: number of output channels
    k: stride of depthwise
    s: stride
    **kwargs: will be passed to mbv3_op

  Returns:
    An object (lib._Op) for inserting in conv_def, representing this operation.
  """
  expansion_fn = functools.partial(slim.conv2d, kernel_size=k, stride=s)
  return mbv3_op(
      ef,
      n,
      k=1,
      s=s,
      depthwise_location=None,
      expansion_fn=expansion_fn,
      **kwargs)


mbv3_op_se = functools.partial(mbv3_op, se=_se4)


DEFAULTS = {
    (ops.expanded_conv,):
        dict(
            normalizer_fn=slim.batch_norm,
            residual=True),
    (slim.conv2d, slim.fully_connected, slim.separable_conv2d): {
        'normalizer_fn': slim.batch_norm,
        'activation_fn': tf.nn.relu,
    },
    (slim.batch_norm,): {
        'center': True,
        'scale': True
    },
}

# Compatible checkpoint: http://mldash/5511169891790690458#scalars
V3_LARGE = dict(
    defaults=dict(DEFAULTS),
    spec=([
        # stage 1
        op(slim.conv2d, stride=2, num_outputs=16, kernel_size=(3, 3),
           activation_fn=hard_swish),
        mbv3_op(ef=1, n=16, k=3),
        mbv3_op(ef=4, n=24, k=3, s=2),
        mbv3_op(ef=3, n=24, k=3, s=1),
        mbv3_op_se(ef=3, n=40, k=5, s=2),
        mbv3_op_se(ef=3, n=40, k=5, s=1),
        mbv3_op_se(ef=3, n=40, k=5, s=1),
        mbv3_op(ef=6, n=80, k=3, s=2, act=hard_swish),
        mbv3_op(ef=2.5, n=80, k=3, s=1, act=hard_swish),
        mbv3_op(ef=184/80., n=80, k=3, s=1, act=hard_swish),
        mbv3_op(ef=184/80., n=80, k=3, s=1, act=hard_swish),
        mbv3_op_se(ef=6, n=112, k=3, s=1, act=hard_swish),
        mbv3_op_se(ef=6, n=112, k=3, s=1, act=hard_swish),
        mbv3_op_se(ef=6, n=160, k=5, s=2, act=hard_swish),
        mbv3_op_se(ef=6, n=160, k=5, s=1, act=hard_swish),
        mbv3_op_se(ef=6, n=160, k=5, s=1, act=hard_swish),
        op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=960,
           activation_fn=hard_swish),
        op(reduce_to_1x1, default_size=7, stride=1, padding='VALID'),
        op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1280,
           normalizer_fn=None, activation_fn=hard_swish)
    ]))

# 72.2% accuracy.
V3_LARGE_MINIMALISTIC = dict(
    defaults=dict(DEFAULTS),
    spec=([
        # stage 1
        op(slim.conv2d, stride=2, num_outputs=16, kernel_size=(3, 3)),
        mbv3_op(ef=1, n=16, k=3),
        mbv3_op(ef=4, n=24, k=3, s=2),
        mbv3_op(ef=3, n=24, k=3, s=1),
        mbv3_op(ef=3, n=40, k=3, s=2),
        mbv3_op(ef=3, n=40, k=3, s=1),
        mbv3_op(ef=3, n=40, k=3, s=1),
        mbv3_op(ef=6, n=80, k=3, s=2),
        mbv3_op(ef=2.5, n=80, k=3, s=1),
        mbv3_op(ef=184 / 80., n=80, k=3, s=1),
        mbv3_op(ef=184 / 80., n=80, k=3, s=1),
        mbv3_op(ef=6, n=112, k=3, s=1),
        mbv3_op(ef=6, n=112, k=3, s=1),
        mbv3_op(ef=6, n=160, k=3, s=2),
        mbv3_op(ef=6, n=160, k=3, s=1),
        mbv3_op(ef=6, n=160, k=3, s=1),
        op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=960),
        op(reduce_to_1x1, default_size=7, stride=1, padding='VALID'),
        op(slim.conv2d,
           stride=1,
           kernel_size=[1, 1],
           num_outputs=1280,
           normalizer_fn=None)
    ]))

# Compatible run: http://mldash/2023283040014348118#scalars
V3_SMALL = dict(
    defaults=dict(DEFAULTS),
    spec=([
        # stage 1
        op(slim.conv2d, stride=2, num_outputs=16, kernel_size=(3, 3),
           activation_fn=hard_swish),
        mbv3_op_se(ef=1, n=16, k=3, s=2),
        mbv3_op(ef=72./16, n=24, k=3, s=2),
        mbv3_op(ef=(88./24), n=24, k=3, s=1),
        mbv3_op_se(ef=4, n=40, k=5, s=2, act=hard_swish),
        mbv3_op_se(ef=6, n=40, k=5, s=1, act=hard_swish),
        mbv3_op_se(ef=6, n=40, k=5, s=1, act=hard_swish),
        mbv3_op_se(ef=3, n=48, k=5, s=1, act=hard_swish),
        mbv3_op_se(ef=3, n=48, k=5, s=1, act=hard_swish),
        mbv3_op_se(ef=6, n=96, k=5, s=2, act=hard_swish),
        mbv3_op_se(ef=6, n=96, k=5, s=1, act=hard_swish),
        mbv3_op_se(ef=6, n=96, k=5, s=1, act=hard_swish),
        op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=576,
           activation_fn=hard_swish),
        op(reduce_to_1x1, default_size=7, stride=1, padding='VALID'),
        op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1024,
           normalizer_fn=None, activation_fn=hard_swish)
    ]))

# 62% accuracy.
V3_SMALL_MINIMALISTIC = dict(
    defaults=dict(DEFAULTS),
    spec=([
        # stage 1
        op(slim.conv2d, stride=2, num_outputs=16, kernel_size=(3, 3)),
        mbv3_op(ef=1, n=16, k=3, s=2),
        mbv3_op(ef=72. / 16, n=24, k=3, s=2),
        mbv3_op(ef=(88. / 24), n=24, k=3, s=1),
        mbv3_op(ef=4, n=40, k=3, s=2),
        mbv3_op(ef=6, n=40, k=3, s=1),
        mbv3_op(ef=6, n=40, k=3, s=1),
        mbv3_op(ef=3, n=48, k=3, s=1),
        mbv3_op(ef=3, n=48, k=3, s=1),
        mbv3_op(ef=6, n=96, k=3, s=2),
        mbv3_op(ef=6, n=96, k=3, s=1),
        mbv3_op(ef=6, n=96, k=3, s=1),
        op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=576),
        op(reduce_to_1x1, default_size=7, stride=1, padding='VALID'),
        op(slim.conv2d,
           stride=1,
           kernel_size=[1, 1],
           num_outputs=1024,
           normalizer_fn=None)
    ]))


# EdgeTPU friendly variant of MobilenetV3 that uses fused convolutions
# instead of depthwise in the early layers.
V3_EDGETPU = dict(
    defaults=dict(DEFAULTS),
    spec=[
        op(slim.conv2d, stride=2, num_outputs=32, kernel_size=(3, 3)),
        mbv3_fused(k=3, s=1, ef=1, n=16),
        mbv3_fused(k=3, s=2, ef=8, n=32),
        mbv3_fused(k=3, s=1, ef=4, n=32),
        mbv3_fused(k=3, s=1, ef=4, n=32),
        mbv3_fused(k=3, s=1, ef=4, n=32),
        mbv3_fused(k=3, s=2, ef=8, n=48),
        mbv3_fused(k=3, s=1, ef=4, n=48),
        mbv3_fused(k=3, s=1, ef=4, n=48),
        mbv3_fused(k=3, s=1, ef=4, n=48),
        mbv3_op(k=3, s=2, ef=8, n=96),
        mbv3_op(k=3, s=1, ef=4, n=96),
        mbv3_op(k=3, s=1, ef=4, n=96),
        mbv3_op(k=3, s=1, ef=4, n=96),
        mbv3_op(k=3, s=1, ef=8, n=96, residual=False),
        mbv3_op(k=3, s=1, ef=4, n=96),
        mbv3_op(k=3, s=1, ef=4, n=96),
        mbv3_op(k=3, s=1, ef=4, n=96),
        mbv3_op(k=5, s=2, ef=8, n=160),
        mbv3_op(k=5, s=1, ef=4, n=160),
        mbv3_op(k=5, s=1, ef=4, n=160),
        mbv3_op(k=5, s=1, ef=4, n=160),
        mbv3_op(k=3, s=1, ef=8, n=192),
        op(slim.conv2d, stride=1, num_outputs=1280, kernel_size=(1, 1)),
    ])


@slim.add_arg_scope
def mobilenet(input_tensor,
              num_classes=1001,
              depth_multiplier=1.0,
              scope='MobilenetV3',
              conv_defs=None,
              finegrain_classification_mode=False,
              **kwargs):
  """Creates mobilenet V3 network.

  Inference mode is created by default. To create training use training_scope
  below.

  with tf.contrib.slim.arg_scope(mobilenet_v3.training_scope()):
     logits, endpoints = mobilenet_v3.mobilenet(input_tensor)

  Args:
    input_tensor: The input tensor
    num_classes: number of classes
    depth_multiplier: The multiplier applied to scale number of
    channels in each layer.
    scope: Scope of the operator
    conv_defs: Which version to create. Could be large/small or
    any conv_def (see mobilenet_v3.py for examples).
    finegrain_classification_mode: When set to True, the model
    will keep the last layer large even for small multipliers. Following
    https://arxiv.org/abs/1801.04381
    it improves performance for ImageNet-type of problems.
      *Note* ignored if final_endpoint makes the builder exit earlier.
    **kwargs: passed directly to mobilenet.mobilenet:
      prediction_fn- what prediction function to use.
      reuse-: whether to reuse variables (if reuse set to true, scope
      must be given).
  Returns:
    logits/endpoints pair

  Raises:
    ValueError: On invalid arguments
  """
  if conv_defs is None:
    conv_defs = V3_LARGE
  if 'multiplier' in kwargs:
    raise ValueError('mobilenetv2 doesn\'t support generic '
                     'multiplier parameter use "depth_multiplier" instead.')
  if finegrain_classification_mode:
    conv_defs = copy.deepcopy(conv_defs)
    conv_defs['spec'][-1] = conv_defs['spec'][-1]._replace(
        multiplier_func=lambda params, multiplier: params)
  depth_args = {}
  with slim.arg_scope((lib.depth_multiplier,), **depth_args):
    return lib.mobilenet(
        input_tensor,
        num_classes=num_classes,
        conv_defs=conv_defs,
        scope=scope,
        multiplier=depth_multiplier,
        **kwargs)

mobilenet.default_image_size = 224
training_scope = lib.training_scope


@slim.add_arg_scope
def mobilenet_base(input_tensor, depth_multiplier=1.0, **kwargs):
  """Creates base of the mobilenet (no pooling and no logits) ."""
  return mobilenet(
      input_tensor, depth_multiplier=depth_multiplier, base_only=True, **kwargs)


def wrapped_partial(func, new_defaults=None,
                    **kwargs):
  """Partial function with new default parameters and updated docstring."""
  if not new_defaults:
    new_defaults = {}
  def func_wrapper(*f_args, **f_kwargs):
    new_kwargs = dict(new_defaults)
    new_kwargs.update(f_kwargs)
    return func(*f_args, **new_kwargs)
  functools.update_wrapper(func_wrapper, func)
  partial_func = functools.partial(func_wrapper, **kwargs)
  functools.update_wrapper(partial_func, func)
  return partial_func


large = wrapped_partial(mobilenet, conv_defs=V3_LARGE)
small = wrapped_partial(mobilenet, conv_defs=V3_SMALL)
edge_tpu = wrapped_partial(mobilenet,
                           new_defaults={'scope': 'MobilenetEdgeTPU'},
                           conv_defs=V3_EDGETPU)
edge_tpu_075 = wrapped_partial(
    mobilenet,
    new_defaults={'scope': 'MobilenetEdgeTPU'},
    conv_defs=V3_EDGETPU,
    depth_multiplier=0.75,
    finegrain_classification_mode=True)

# Minimalistic model that does not have Squeeze Excite blocks,
# Hardswish, or 5x5 depthwise convolution.
# This makes the model very friendly for a wide range of hardware
large_minimalistic = wrapped_partial(mobilenet, conv_defs=V3_LARGE_MINIMALISTIC)
small_minimalistic = wrapped_partial(mobilenet, conv_defs=V3_SMALL_MINIMALISTIC)


def _reduce_consecutive_layers(conv_defs, start_id, end_id, multiplier=0.5):
  """Reduce the outputs of consecutive layers with multiplier.

  Args:
    conv_defs: Mobilenet conv_defs.
    start_id: 0-based index of the starting conv_def to be reduced.
    end_id: 0-based index of the last conv_def to be reduced.
    multiplier: The multiplier by which to reduce the conv_defs.

  Returns:
    Mobilenet conv_defs where the output sizes from layers [start_id, end_id],
    inclusive, are reduced by multiplier.

  Raises:
    ValueError if any layer to be reduced does not have the 'num_outputs'
    attribute.
  """
  defs = copy.deepcopy(conv_defs)
  for d in defs['spec'][start_id:end_id+1]:
    d.params.update({
        'num_outputs': np.int(np.round(d.params['num_outputs'] * multiplier))
    })
  return defs


V3_LARGE_DETECTION = _reduce_consecutive_layers(V3_LARGE, 13, 16)
V3_SMALL_DETECTION = _reduce_consecutive_layers(V3_SMALL, 9, 12)


__all__ = ['training_scope', 'mobilenet', 'V3_LARGE', 'V3_SMALL', 'large',
           'small', 'V3_LARGE_DETECTION', 'V3_SMALL_DETECTION']