Commit 0016b0a7 authored by sunxx1

Merge branch 'dtk22.04' into 'main'

Dtk22.04

See merge request dcutoolkit/deeplearing/dlexamples_new!49
parents 17bc28d5 7a382d5d
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DarkNet models for KerasCV.
Reference:
- [YoloV4 Paper](https://arxiv.org/abs/1804.02767)
- [CSPNet Paper](https://arxiv.org/pdf/1911.11929)
- [YoloX Paper](https://arxiv.org/abs/2107.08430)
- [YoloX implementation](https://github.com/ultralytics/yolov3)
"""
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras_cv.models import utils
from keras_cv.models.__internal__.darknet_utils import CrossStagePartial
from keras_cv.models.__internal__.darknet_utils import DarknetConvBlock
from keras_cv.models.__internal__.darknet_utils import DarknetConvBlockDepthwise
from keras_cv.models.__internal__.darknet_utils import Focus
from keras_cv.models.__internal__.darknet_utils import SpatialPyramidPoolingBottleneck
def CSPDarkNet(
include_rescaling,
include_top,
depth_multiplier=1.0,
width_multiplier=1.0,
use_depthwise=False,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classifier_activation="softmax",
name=None,
**kwargs,
):
"""Instantiates the CSPDarkNet architecture.
Although the CSPDarkNet architecture is commonly used for detection tasks, it is
possible to extract the intermediate dark2 to dark5 layers from the model for
creating a Feature Pyramid Network.
Reference:
- [YoloV4 Paper](https://arxiv.org/abs/2004.10934)
- [CSPNet Paper](https://arxiv.org/pdf/1911.11929)
- [YoloX Paper](https://arxiv.org/abs/2107.08430)
- [YoloX implementation](https://github.com/Megvii-BaseDetection/YOLOX)
For transfer learning use cases, make sure to read the
[guide to transfer learning & fine-tuning](https://keras.io/guides/transfer_learning/).
Args:
include_rescaling: whether or not to rescale the inputs. If set to True,
inputs will be passed through a `Rescaling(1/255.0)` layer.
include_top: whether to include the fully-connected layer at the top of
the network. If set to True, `classes` must be provided.
depth_multiplier: a float value used to calculate the base depth of the
model. This changes based on the detection model being used. Defaults to 1.0.
width_multiplier: a float value used to calculate the base width of the
model. This changes based on the detection model being used. Defaults to 1.0.
use_depthwise: a boolean value used to decide whether a depthwise conv block
should be used over a regular darknet block. Defaults to False.
classes: optional number of classes to classify images into, only to be
specified if `include_top` is True.
weights: one of `None` (random initialization), or a pretrained weight
file path.
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
to use as image input for the model.
input_shape: optional shape tuple, defaults to (None, None, 3).
pooling: optional pooling mode for feature extraction when `include_top`
is `False`.
- `None` means that the output of the model will be the 4D tensor output
of the last convolutional block.
- `avg` means that global average pooling will be applied to the
output of the last convolutional block, and thus the output of the
model will be a 2D tensor.
- `max` means that global max pooling will be applied.
classifier_activation: A `str` or callable. The activation function to use
on the "top" layer. Ignored unless `include_top=True`. Set
`classifier_activation=None` to return the logits of the "top" layer.
name: (Optional) name to pass to the model. Defaults to "DarkNet".
Returns:
A `keras.Model` instance.
"""
if weights and not tf.io.gfile.exists(weights):
raise ValueError(
"The `weights` argument should be either `None` or the path to the "
f"weights file to be loaded. Weights file not found at location: {weights}"
)
if include_top and not classes:
raise ValueError(
"If `include_top` is True, you should specify `classes`. Received: "
f"classes={classes}"
)
ConvBlock = DarknetConvBlockDepthwise if use_depthwise else DarknetConvBlock
base_channels = int(width_multiplier * 64)
base_depth = max(round(depth_multiplier * 3), 1)
inputs = utils.parse_model_inputs(input_shape, input_tensor)
x = inputs
if include_rescaling:
x = layers.Rescaling(1 / 255.0)(x)
# stem
x = Focus(name="stem_focus")(x)
x = DarknetConvBlock(base_channels, kernel_size=3, strides=1, name="stem_conv")(x)
# dark2
x = ConvBlock(base_channels * 2, kernel_size=3, strides=2, name="dark2_conv")(x)
x = CrossStagePartial(
base_channels * 2,
num_bottlenecks=base_depth,
use_depthwise=use_depthwise,
name="dark2_csp",
)(x)
# dark3
x = ConvBlock(base_channels * 4, kernel_size=3, strides=2, name="dark3_conv")(x)
x = CrossStagePartial(
base_channels * 4,
num_bottlenecks=base_depth * 3,
use_depthwise=use_depthwise,
name="dark3_csp",
)(x)
# dark4
x = ConvBlock(base_channels * 8, kernel_size=3, strides=2, name="dark4_conv")(x)
x = CrossStagePartial(
base_channels * 8,
num_bottlenecks=base_depth * 3,
use_depthwise=use_depthwise,
name="dark4_csp",
)(x)
# dark5
x = ConvBlock(base_channels * 16, kernel_size=3, strides=2, name="dark5_conv")(x)
x = SpatialPyramidPoolingBottleneck(
base_channels * 16, hidden_filters=base_channels * 8, name="dark5_spp"
)(x)
x = CrossStagePartial(
base_channels * 16,
num_bottlenecks=base_depth,
residual=False,
use_depthwise=use_depthwise,
name="dark5_csp",
)(x)
if include_top:
x = layers.GlobalAveragePooling2D(name="avg_pool")(x)
x = layers.Dense(classes, activation=classifier_activation, name="predictions")(
x
)
elif pooling == "avg":
x = layers.GlobalAveragePooling2D(name="avg_pool")(x)
elif pooling == "max":
x = layers.GlobalMaxPooling2D(name="max_pool")(x)
model = keras.Model(inputs, x, name=name, **kwargs)
if weights is not None:
model.load_weights(weights)
return model
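# Usage sketch (illustrative, not part of the upstream file): the input size
# and class count below are assumptions, chosen only to show the two common
# configurations. The import path follows the test module further down.
#
#   from keras_cv.models import csp_darknet
#
#   # Full classifier: stem + dark2-dark5 + global average pooling + dense head.
#   model = csp_darknet.CSPDarkNet(
#       include_rescaling=True,
#       include_top=True,
#       classes=10,
#       input_shape=(224, 224, 3),
#   )
#
#   # Headless backbone with a pooled 2D output, e.g. for transfer learning.
#   backbone = csp_darknet.CSPDarkNet(
#       include_rescaling=True,
#       include_top=False,
#       pooling="avg",
#   )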
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from absl.testing import parameterized
from keras_cv.models import csp_darknet
from .models_test import ModelsTest
MODEL_LIST = [
(csp_darknet.CSPDarkNet, 1024, {}),
]
class CSPDarkNetTest(ModelsTest, tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(*MODEL_LIST)
def test_application_base(self, app, _, args):
super()._test_application_base(app, _, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_with_rescaling(self, app, last_dim, args):
super()._test_application_with_rescaling(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_pooling(self, app, last_dim, args):
super()._test_application_pooling(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_variable_input_channels(self, app, last_dim, args):
super()._test_application_variable_input_channels(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_model_can_be_used_as_backbone(self, app, last_dim, args):
super()._test_model_can_be_used_as_backbone(app, last_dim, args)
if __name__ == "__main__":
tf.test.main()
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DarkNet models for KerasCV.
Reference:
- [YoloV3 Paper](https://arxiv.org/abs/1804.02767)
- [YoloV3 implementation](https://github.com/ultralytics/yolov3)
"""
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras_cv.models import utils
from keras_cv.models.__internal__.darknet_utils import DarknetConvBlock
from keras_cv.models.__internal__.darknet_utils import ResidualBlocks
from keras_cv.models.__internal__.darknet_utils import SpatialPyramidPoolingBottleneck
BASE_DOCSTRING = """Instantiates the {name} architecture.
Although the {name} architecture is commonly used for detection tasks, it is
possible to extract the intermediate dark2 to dark5 layers from the model for
creating a Feature Pyramid Network.
Reference:
- [YoloV3 Paper](https://arxiv.org/abs/1804.02767)
- [YoloV3 implementation](https://github.com/ultralytics/yolov3)
For transfer learning use cases, make sure to read the
[guide to transfer learning & fine-tuning](
https://keras.io/guides/transfer_learning/).
Args:
include_rescaling: whether or not to rescale the inputs. If set to True,
inputs will be passed through a `Rescaling(1/255.0)` layer.
include_top: whether to include the fully-connected layer at the top of
the network. If set to True, `classes` must be provided.
classes: optional number of classes to classify images into, only to be
specified if `include_top` is True.
weights: one of `None` (random initialization), or a pretrained weight
file path.
input_shape: optional shape tuple, defaults to (None, None, 3).
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
to use as image input for the model.
pooling: optional pooling mode for feature extraction when `include_top`
is `False`.
- `None` means that the output of the model will be the 4D tensor output
of the last convolutional block.
- `avg` means that global average pooling will be applied to the
output of the last convolutional block, and thus the output of the
model will be a 2D tensor.
- `max` means that global max pooling will be applied.
name: (Optional) name to pass to the model. Defaults to "{name}".
Returns:
A `keras.Model` instance.
"""
def DarkNet(
blocks,
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classifier_activation="softmax",
name=None,
**kwargs,
):
"""Instantiates the DarkNet architecture.
Although the DarkNet architecture is commonly used for detection tasks, it is
possible to extract the intermediate dark2 to dark5 layers from the model for
creating a Feature Pyramid Network.
Reference:
- [YoloV3 Paper](https://arxiv.org/abs/1804.02767)
- [YoloV3 implementation](https://github.com/ultralytics/yolov3)
For transfer learning use cases, make sure to read the
[guide to transfer learning & fine-tuning](
https://keras.io/guides/transfer_learning/).
Args:
blocks: list of the number of building blocks for each of the layers
dark2 to dark5.
include_rescaling: whether or not to rescale the inputs. If set to True,
inputs will be passed through a `Rescaling(1/255.0)` layer.
include_top: whether to include the fully-connected layer at the top of
the network. If set to True, `classes` must be provided.
classes: optional number of classes to classify images into, only to be
specified if `include_top` is True.
weights: one of `None` (random initialization), or a pretrained weight
file path.
input_shape: optional shape tuple, defaults to (None, None, 3).
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
to use as image input for the model.
pooling: optional pooling mode for feature extraction when `include_top`
is `False`.
- `None` means that the output of the model will be the 4D tensor output
of the last convolutional block.
- `avg` means that global average pooling will be applied to the
output of the last convolutional block, and thus the output of the
model will be a 2D tensor.
- `max` means that global max pooling will be applied.
classifier_activation: A `str` or callable. The activation function to use
on the "top" layer. Ignored unless `include_top=True`. Set
`classifier_activation=None` to return the logits of the "top" layer.
name: (Optional) name to pass to the model. Defaults to "DarkNet".
Returns:
A `keras.Model` instance.
"""
if weights and not tf.io.gfile.exists(weights):
raise ValueError(
"The `weights` argument should be either `None` or the path to the "
f"weights file to be loaded. Weights file not found at location: {weights}"
)
if include_top and not classes:
raise ValueError(
"If `include_top` is True, you should specify `classes`. Received: "
f"classes={classes}"
)
inputs = utils.parse_model_inputs(input_shape, input_tensor)
x = inputs
if include_rescaling:
x = layers.Rescaling(1 / 255.0)(x)
# stem
x = DarknetConvBlock(
filters=32, kernel_size=3, strides=1, activation="leaky_relu", name="stem_conv"
)(x)
x = ResidualBlocks(filters=64, num_blocks=1, name="stem_residual_block")(x)
# filters for the ResidualBlock outputs
filters = [128, 256, 512, 1024]
# layer_num is used for naming the residual blocks (starts with dark2, hence 2)
layer_num = 2
for num_filters, num_blocks in zip(filters, blocks):
x = ResidualBlocks(
filters=num_filters, num_blocks=num_blocks, name=f"dark{layer_num}_residual_block"
)(x)
layer_num += 1
# remaining dark5 layers
x = DarknetConvBlock(
filters=512,
kernel_size=1,
strides=1,
activation="leaky_relu",
name="dark5_conv1",
)(x)
x = DarknetConvBlock(
filters=1024,
kernel_size=3,
strides=1,
activation="leaky_relu",
name="dark5_conv2",
)(x)
x = SpatialPyramidPoolingBottleneck(512, activation="leaky_relu", name="dark5_spp")(
x
)
x = DarknetConvBlock(
filters=1024,
kernel_size=3,
strides=1,
activation="leaky_relu",
name="dark5_conv3",
)(x)
x = DarknetConvBlock(
filters=512,
kernel_size=1,
strides=1,
activation="leaky_relu",
name="dark5_conv4",
)(x)
if include_top:
x = layers.GlobalAveragePooling2D(name="avg_pool")(x)
x = layers.Dense(classes, activation=classifier_activation, name="predictions")(
x
)
elif pooling == "avg":
x = layers.GlobalAveragePooling2D(name="avg_pool")(x)
elif pooling == "max":
x = layers.GlobalMaxPooling2D(name="max_pool")(x)
model = keras.Model(inputs, x, name=name, **kwargs)
if weights is not None:
model.load_weights(weights)
return model
def DarkNet21(
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
name="DarkNet21",
**kwargs,
):
return DarkNet(
[1, 2, 2, 1],
include_rescaling=include_rescaling,
include_top=include_top,
classes=classes,
weights=weights,
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
name=name,
**kwargs,
)
def DarkNet53(
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
name="DarkNet53",
**kwargs,
):
return DarkNet(
[2, 8, 8, 4],
include_rescaling=include_rescaling,
include_top=include_top,
classes=classes,
weights=weights,
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
name=name,
**kwargs,
)
setattr(DarkNet21, "__doc__", BASE_DOCSTRING.format(name="DarkNet21"))
setattr(DarkNet53, "__doc__", BASE_DOCSTRING.format(name="DarkNet53"))
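# Usage sketch (illustrative, not part of the upstream file; the input size and
# class count are assumptions). DarkNet53 stacks [2, 8, 8, 4] residual blocks
# for dark2-dark5, while DarkNet21 uses the shallower [1, 2, 2, 1] configuration:
#
#   from keras_cv.models import darknet
#
#   model = darknet.DarkNet53(
#       include_rescaling=True,
#       include_top=True,
#       classes=10,
#       input_shape=(256, 256, 3),
#   )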
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from absl.testing import parameterized
from keras_cv.models import darknet
from .models_test import ModelsTest
MODEL_LIST = [
(darknet.DarkNet21, 512, {}),
(darknet.DarkNet53, 512, {}),
]
class DarkNetTest(ModelsTest, tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(*MODEL_LIST)
def test_application_base(self, app, _, args):
super()._test_application_base(app, _, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_with_rescaling(self, app, last_dim, args):
super()._test_application_with_rescaling(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_pooling(self, app, last_dim, args):
super()._test_application_pooling(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_variable_input_channels(self, app, last_dim, args):
super()._test_application_variable_input_channels(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_model_can_be_used_as_backbone(self, app, last_dim, args):
super()._test_model_can_be_used_as_backbone(app, last_dim, args)
if __name__ == "__main__":
tf.test.main()
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DenseNet models for KerasCV.
Reference:
- [Densely Connected Convolutional Networks](https://arxiv.org/abs/1608.06993)
- [Based on the original keras.applications DenseNet](https://github.com/keras-team/keras/blob/master/keras/applications/densenet.py)
"""
from tensorflow import keras
from tensorflow.keras import backend
from tensorflow.keras import layers
from keras_cv.models import utils
from keras_cv.models.weights import parse_weights
BN_AXIS = 3
BASE_DOCSTRING = """Instantiates the {name} architecture.
Reference:
- [Densely Connected Convolutional Networks (CVPR 2017)](https://arxiv.org/abs/1608.06993)
This function returns a Keras {name} model.
For transfer learning use cases, make sure to read the [guide to transfer
learning & fine-tuning](https://keras.io/guides/transfer_learning/).
Args:
include_rescaling: whether or not to rescale the inputs. If set to True,
inputs will be passed through a `Rescaling(1/255.0)` layer.
include_top: whether to include the fully-connected layer at the top of the
network. If set to True, `classes` must be provided.
classes: optional number of classes to classify images into, only to be
specified if `include_top` is True.
weights: one of `None` (random initialization), a pretrained weight file
path, or a reference to pre-trained weights (e.g. 'imagenet/classification')
(see available pre-trained weights in weights.py)
input_shape: optional shape tuple, defaults to (None, None, 3).
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
to use as image input for the model.
pooling: optional pooling mode for feature extraction
when `include_top` is `False`.
- `None` means that the output of the model will be the 4D tensor output
of the last convolutional block.
- `avg` means that global average pooling will be applied to the output
of the last convolutional block, and thus the output of the model will
be a 2D tensor.
- `max` means that global max pooling will be applied.
name: (Optional) name to pass to the model. Defaults to "{name}".
Returns:
A `keras.Model` instance.
"""
def DenseBlock(blocks, name=None):
"""A dense block.
Args:
blocks: integer, the number of building blocks.
name: string, block label.
Returns:
a function that takes an input Tensor and returns the output of the DenseBlock.
"""
if name is None:
name = f"dense_block_{backend.get_uid('dense_block')}"
def apply(x):
for i in range(blocks):
x = ConvBlock(32, name=f"{name}_block_{i}")(x)
return x
return apply
def TransitionBlock(reduction, name=None):
"""A transition block.
Args:
reduction: float, compression rate at transition layers.
name: string, block label.
Returns:
a function that takes an input Tensor and returns the output of the TransitionBlock.
"""
if name is None:
name = f"transition_block_{backend.get_uid('transition_block')}"
def apply(x):
x = layers.BatchNormalization(
axis=BN_AXIS, epsilon=1.001e-5, name=f"{name}_bn"
)(x)
x = layers.Activation("relu", name=f"{name}_relu")(x)
x = layers.Conv2D(
int(backend.int_shape(x)[BN_AXIS] * reduction),
1,
use_bias=False,
name=f"{name}_conv",
)(x)
x = layers.AveragePooling2D(2, strides=2, name=f"{name}_pool")(x)
return x
return apply
def ConvBlock(growth_rate, name=None):
"""A building block for a dense block.
Args:
growth_rate: float, growth rate at dense layers.
name: string, block label.
Returns:
a function that takes an input Tensor and returns the output of the ConvBlock.
"""
if name is None:
name = f"conv_block_{backend.get_uid('conv_block')}"
def apply(x):
x1 = layers.BatchNormalization(
axis=BN_AXIS, epsilon=1.001e-5, name=f"{name}_0_bn"
)(x)
x1 = layers.Activation("relu", name=f"{name}_0_relu")(x1)
x1 = layers.Conv2D(4 * growth_rate, 1, use_bias=False, name=f"{name}_1_conv")(
x1
)
x1 = layers.BatchNormalization(
axis=BN_AXIS, epsilon=1.001e-5, name=f"{name}_1_bn"
)(x1)
x1 = layers.Activation("relu", name=f"{name}_1_relu")(x1)
x1 = layers.Conv2D(
growth_rate, 3, padding="same", use_bias=False, name=f"{name}_2_conv"
)(x1)
x = layers.Concatenate(axis=BN_AXIS, name=f"{name}_concat")([x, x1])
return x
return apply
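# Channel bookkeeping (illustrative note, not part of the upstream file): each
# ConvBlock concatenates `growth_rate` = 32 new channels onto its input, so a
# DenseBlock of `blocks` ConvBlocks widens its input by 32 * blocks channels,
# and each TransitionBlock then halves the width (reduction=0.5). For
# DenseNet121 ([6, 12, 24, 16]): 64 -> 256 -> 128 -> 512 -> 256 -> 1024 -> 512
# -> 1024, matching the 1024 final feature dimension used in the tests below.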
def DenseNet(
blocks,
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classifier_activation="softmax",
name="DenseNet",
**kwargs,
):
"""Instantiates the DenseNet architecture.
Reference:
- [Densely Connected Convolutional Networks (CVPR 2017)](https://arxiv.org/abs/1608.06993)
This function returns a Keras DenseNet model.
For transfer learning use cases, make sure to read the [guide to transfer
learning & fine-tuning](https://keras.io/guides/transfer_learning/).
Args:
blocks: numbers of building blocks for the four dense layers.
include_rescaling: whether or not to rescale the inputs. If set to True,
inputs will be passed through a `Rescaling(1/255.0)` layer.
include_top: whether to include the fully-connected layer at the top of the
network. If set to True, `classes` must be provided.
classes: optional number of classes to classify images into, only to be
specified if `include_top` is True.
weights: one of `None` (random initialization), or a pretrained weight file
path.
input_shape: optional shape tuple, defaults to (None, None, 3).
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
to use as image input for the model.
pooling: optional pooling mode for feature extraction
when `include_top` is `False`.
- `None` means that the output of the model will be the 4D tensor output
of the last convolutional block.
- `avg` means that global average pooling will be applied to the output
of the last convolutional block, and thus the output of the model will
be a 2D tensor.
- `max` means that global max pooling will be applied.
classifier_activation: A `str` or callable. The activation function to use
on the "top" layer. Ignored unless `include_top=True`. Set
`classifier_activation=None` to return the logits of the "top" layer.
name: (Optional) name to pass to the model. Defaults to "DenseNet".
Returns:
A `keras.Model` instance.
"""
if include_top and not classes:
raise ValueError(
"If `include_top` is True, you should specify `classes`. "
f"Received: classes={classes}"
)
inputs = utils.parse_model_inputs(input_shape, input_tensor)
x = inputs
if include_rescaling:
x = layers.Rescaling(1 / 255.0)(x)
x = layers.Conv2D(
64, 7, strides=2, use_bias=False, padding="same", name="conv1/conv"
)(x)
x = layers.BatchNormalization(axis=BN_AXIS, epsilon=1.001e-5, name="conv1/bn")(x)
x = layers.Activation("relu", name="conv1/relu")(x)
x = layers.MaxPooling2D(3, strides=2, padding="same", name="pool1")(x)
x = DenseBlock(blocks[0], name="conv2")(x)
x = TransitionBlock(0.5, name="pool2")(x)
x = DenseBlock(blocks[1], name="conv3")(x)
x = TransitionBlock(0.5, name="pool3")(x)
x = DenseBlock(blocks[2], name="conv4")(x)
x = TransitionBlock(0.5, name="pool4")(x)
x = DenseBlock(blocks[3], name="conv5")(x)
x = layers.BatchNormalization(axis=BN_AXIS, epsilon=1.001e-5, name="bn")(x)
x = layers.Activation("relu", name="relu")(x)
if include_top:
x = layers.GlobalAveragePooling2D(name="avg_pool")(x)
x = layers.Dense(classes, activation=classifier_activation, name="predictions")(
x
)
elif pooling == "avg":
x = layers.GlobalAveragePooling2D(name="avg_pool")(x)
elif pooling == "max":
x = layers.GlobalMaxPooling2D(name="max_pool")(x)
model = keras.Model(inputs, x, name=name, **kwargs)
if weights is not None:
model.load_weights(weights)
return model
def DenseNet121(
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
name="DenseNet121",
**kwargs,
):
return DenseNet(
[6, 12, 24, 16],
include_rescaling=include_rescaling,
include_top=include_top,
classes=classes,
weights=parse_weights(weights, include_top, "densenet121"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
name=name,
**kwargs,
)
def DenseNet169(
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
name="DenseNet169",
**kwargs,
):
return DenseNet(
[6, 12, 32, 32],
include_rescaling=include_rescaling,
include_top=include_top,
classes=classes,
weights=parse_weights(weights, include_top, "densenet169"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
name=name,
**kwargs,
)
def DenseNet201(
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
name="DenseNet201",
**kwargs,
):
return DenseNet(
blocks=[6, 12, 48, 32],
include_rescaling=include_rescaling,
include_top=include_top,
classes=classes,
weights=parse_weights(weights, include_top, "densenet201"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
name=name,
**kwargs,
)
setattr(DenseNet121, "__doc__", BASE_DOCSTRING.format(name="DenseNet121"))
setattr(DenseNet169, "__doc__", BASE_DOCSTRING.format(name="DenseNet169"))
setattr(DenseNet201, "__doc__", BASE_DOCSTRING.format(name="DenseNet201"))
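# Usage sketch (illustrative, not part of the upstream file; the input size and
# class count are assumptions):
#
#   from keras_cv.models import densenet
#
#   model = densenet.DenseNet121(
#       include_rescaling=True,
#       include_top=True,
#       classes=10,
#       input_shape=(224, 224, 3),
#   )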
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from absl.testing import parameterized
from keras_cv.models import densenet
from .models_test import ModelsTest
MODEL_LIST = [
(densenet.DenseNet121, 1024, {}),
(densenet.DenseNet169, 1664, {}),
(densenet.DenseNet201, 1920, {}),
]
class DenseNetTest(ModelsTest, tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(*MODEL_LIST)
def test_application_base(self, app, _, args):
super()._test_application_base(app, _, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_with_rescaling(self, app, last_dim, args):
super()._test_application_with_rescaling(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_pooling(self, app, last_dim, args):
super()._test_application_pooling(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_variable_input_channels(self, app, last_dim, args):
super()._test_application_variable_input_channels(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_model_can_be_used_as_backbone(self, app, last_dim, args):
super()._test_model_can_be_used_as_backbone(app, last_dim, args)
if __name__ == "__main__":
tf.test.main()
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""EfficientNet models for Keras.
Reference:
- [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](
https://arxiv.org/abs/1905.11946) (ICML 2019)
- [Based on the original keras.applications EfficientNet](https://github.com/keras-team/keras/blob/master/keras/applications/efficientnet.py)
"""
import copy
import math
import tensorflow as tf
from keras import backend
from keras import layers
from keras_cv.models import utils
from keras_cv.models.weights import parse_weights
DEFAULT_BLOCKS_ARGS = [
{
"kernel_size": 3,
"repeats": 1,
"filters_in": 32,
"filters_out": 16,
"expand_ratio": 1,
"id_skip": True,
"strides": 1,
"se_ratio": 0.25,
},
{
"kernel_size": 3,
"repeats": 2,
"filters_in": 16,
"filters_out": 24,
"expand_ratio": 6,
"id_skip": True,
"strides": 2,
"se_ratio": 0.25,
},
{
"kernel_size": 5,
"repeats": 2,
"filters_in": 24,
"filters_out": 40,
"expand_ratio": 6,
"id_skip": True,
"strides": 2,
"se_ratio": 0.25,
},
{
"kernel_size": 3,
"repeats": 3,
"filters_in": 40,
"filters_out": 80,
"expand_ratio": 6,
"id_skip": True,
"strides": 2,
"se_ratio": 0.25,
},
{
"kernel_size": 5,
"repeats": 3,
"filters_in": 80,
"filters_out": 112,
"expand_ratio": 6,
"id_skip": True,
"strides": 1,
"se_ratio": 0.25,
},
{
"kernel_size": 5,
"repeats": 4,
"filters_in": 112,
"filters_out": 192,
"expand_ratio": 6,
"id_skip": True,
"strides": 2,
"se_ratio": 0.25,
},
{
"kernel_size": 3,
"repeats": 1,
"filters_in": 192,
"filters_out": 320,
"expand_ratio": 6,
"id_skip": True,
"strides": 1,
"se_ratio": 0.25,
},
]
CONV_KERNEL_INITIALIZER = {
"class_name": "VarianceScaling",
"config": {
"scale": 2.0,
"mode": "fan_out",
"distribution": "truncated_normal",
},
}
DENSE_KERNEL_INITIALIZER = {
"class_name": "VarianceScaling",
"config": {
"scale": 1.0 / 3.0,
"mode": "fan_out",
"distribution": "uniform",
},
}
BASE_DOCSTRING = """Instantiates the {name} architecture.
Reference:
- [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](
https://arxiv.org/abs/1905.11946) (ICML 2019)
This function returns a Keras image classification model,
optionally loaded with weights pre-trained on ImageNet.
For image classification use cases, see
[this page for detailed examples](
https://keras.io/api/applications/#usage-examples-for-image-classification-models).
For transfer learning use cases, make sure to read the
[guide to transfer learning & fine-tuning](
https://keras.io/guides/transfer_learning/).
Args:
include_rescaling: whether or not to rescale the inputs. If set to True,
inputs will be passed through a `Rescaling(1/255.0)` layer.
include_top: Whether to include the fully-connected
layer at the top of the network.
weights: One of `None` (random initialization),
or the path to the weights file to be loaded.
input_shape: Optional shape tuple.
It should have exactly 3 input channels.
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
to use as image input for the model.
pooling: Optional pooling mode for feature extraction
when `include_top` is `False`. Defaults to None.
- `None` means that the output of the model will be
the 4D tensor output of the
last convolutional layer.
- `avg` means that global average pooling
will be applied to the output of the
last convolutional layer, and thus
the output of the model will be a 2D tensor.
- `max` means that global max pooling will
be applied.
classes: Optional number of classes to classify images
into, only to be specified if `include_top` is True, and
if no `weights` argument is specified. Defaults to None.
classifier_activation: A `str` or callable. The activation function to use
on the "top" layer. Ignored unless `include_top=True`. Set
`classifier_activation=None` to return the logits of the "top" layer.
Defaults to 'softmax'.
When loading pretrained weights, `classifier_activation` can only
be `None` or `"softmax"`.
Returns:
A `keras.Model` instance.
"""
BN_AXIS = 3
def correct_pad(inputs, kernel_size):
"""Returns a tuple for zero-padding for 2D convolution with downsampling.
Args:
inputs: Input tensor.
kernel_size: An integer or tuple/list of 2 integers.
Returns:
A tuple.
"""
img_dim = 1
input_size = backend.int_shape(inputs)[img_dim : (img_dim + 2)]
if isinstance(kernel_size, int):
kernel_size = (kernel_size, kernel_size)
if input_size[0] is None:
adjust = (1, 1)
else:
adjust = (1 - input_size[0] % 2, 1 - input_size[1] % 2)
correct = (kernel_size[0] // 2, kernel_size[1] // 2)
return (
(correct[0] - adjust[0], correct[0]),
(correct[1] - adjust[1], correct[1]),
)
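# Worked example (illustrative): for a 224x224 input and kernel_size=3,
# input_size=(224, 224) is even, so adjust=(1, 1) and correct=(1, 1), giving
# ((0, 1), (0, 1)) -- one row/column of zeros on the bottom and right. For an
# odd 225x225 input, adjust=(0, 0) and the padding becomes ((1, 1), (1, 1)).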
def EfficientNetBlock(
activation="swish",
drop_rate=0.0,
name="",
filters_in=32,
filters_out=16,
kernel_size=3,
strides=1,
expand_ratio=1,
se_ratio=0.0,
id_skip=True,
):
"""An inverted residual block.
Args:
activation: activation function.
drop_rate: float between 0 and 1, fraction of the input units to drop.
name: string, block label.
filters_in: integer, the number of input filters.
filters_out: integer, the number of output filters.
kernel_size: integer, the dimension of the convolution window.
strides: integer, the stride of the convolution.
expand_ratio: integer, scaling coefficient for the input filters.
se_ratio: float between 0 and 1, fraction to squeeze the input filters.
id_skip: boolean.
Returns:
a function that takes an input tensor and returns the output of the block.
"""
# Expansion phase
def apply(inputs):
filters = filters_in * expand_ratio
if expand_ratio != 1:
x = layers.Conv2D(
filters,
1,
padding="same",
use_bias=False,
kernel_initializer=CONV_KERNEL_INITIALIZER,
name=name + "expand_conv",
)(inputs)
x = layers.BatchNormalization(axis=BN_AXIS, name=name + "expand_bn")(x)
x = layers.Activation(activation, name=name + "expand_activation")(x)
else:
x = inputs
# Depthwise Convolution
if strides == 2:
x = layers.ZeroPadding2D(
padding=correct_pad(x, kernel_size),
name=name + "dwconv_pad",
)(x)
conv_pad = "valid"
else:
conv_pad = "same"
x = layers.DepthwiseConv2D(
kernel_size,
strides=strides,
padding=conv_pad,
use_bias=False,
depthwise_initializer=CONV_KERNEL_INITIALIZER,
name=name + "dwconv",
)(x)
x = layers.BatchNormalization(axis=BN_AXIS, name=name + "bn")(x)
x = layers.Activation(activation, name=name + "activation")(x)
# Squeeze and Excitation phase
if 0 < se_ratio <= 1:
filters_se = max(1, int(filters_in * se_ratio))
se = layers.GlobalAveragePooling2D(name=name + "se_squeeze")(x)
if BN_AXIS == 1:
se_shape = (filters, 1, 1)
else:
se_shape = (1, 1, filters)
se = layers.Reshape(se_shape, name=name + "se_reshape")(se)
se = layers.Conv2D(
filters_se,
1,
padding="same",
activation=activation,
kernel_initializer=CONV_KERNEL_INITIALIZER,
name=name + "se_reduce",
)(se)
se = layers.Conv2D(
filters,
1,
padding="same",
activation="sigmoid",
kernel_initializer=CONV_KERNEL_INITIALIZER,
name=name + "se_expand",
)(se)
x = layers.multiply([x, se], name=name + "se_excite")
# Output phase
x = layers.Conv2D(
filters_out,
1,
padding="same",
use_bias=False,
kernel_initializer=CONV_KERNEL_INITIALIZER,
name=name + "project_conv",
)(x)
x = layers.BatchNormalization(axis=BN_AXIS, name=name + "project_bn")(x)
if id_skip and strides == 1 and filters_in == filters_out:
if drop_rate > 0:
x = layers.Dropout(
drop_rate, noise_shape=(None, 1, 1, 1), name=name + "drop"
)(x)
x = layers.add([x, inputs], name=name + "add")
return x
return apply
def EfficientNet(
include_rescaling,
include_top,
width_coefficient,
depth_coefficient,
default_size,
dropout_rate=0.2,
drop_connect_rate=0.2,
depth_divisor=8,
activation="swish",
blocks_args="default",
model_name="efficientnet",
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classes=None,
classifier_activation="softmax",
):
"""Instantiates the EfficientNet architecture using given scaling coefficients.
Args:
include_rescaling: whether or not to rescale the inputs. If set to True,
inputs will be passed through a `Rescaling(1/255.0)` layer.
include_top: whether to include the fully-connected
layer at the top of the network.
width_coefficient: float, scaling coefficient for network width.
depth_coefficient: float, scaling coefficient for network depth.
default_size: integer, default input image size.
dropout_rate: float, dropout rate before final classifier layer.
drop_connect_rate: float, dropout rate at skip connections.
depth_divisor: integer, a unit of network width.
activation: activation function.
blocks_args: list of dicts, parameters to construct block modules.
model_name: string, model name.
weights: one of `None` (random initialization),
or the path to the weights file to be loaded.
input_shape: optional shape tuple.
It should have exactly 3 input channels.
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
to use as image input for the model.
pooling: optional pooling mode for feature extraction
when `include_top` is `False`.
- `None` means that the output of the model will be
the 4D tensor output of the
last convolutional layer.
- `avg` means that global average pooling
will be applied to the output of the
last convolutional layer, and thus
the output of the model will be a 2D tensor.
- `max` means that global max pooling will
be applied.
classes: optional number of classes to classify images
into, only to be specified if `include_top` is True, and
if no `weights` argument is specified.
classifier_activation: A `str` or callable. The activation function to use
on the "top" layer. Ignored unless `include_top=True`. Set
`classifier_activation=None` to return the logits of the "top" layer.
Returns:
A `keras.Model` instance.
Raises:
ValueError: in case of invalid argument for `weights`,
or invalid input shape.
ValueError: if `classifier_activation` is not `softmax` or `None` when
using a pretrained top layer.
"""
if blocks_args == "default":
blocks_args = DEFAULT_BLOCKS_ARGS
if weights and not tf.io.gfile.exists(weights):
raise ValueError(
"The `weights` argument should be either `None` or the path to the "
"weights file to be loaded. Weights file not found at location: {weights}"
)
if include_top and not classes:
raise ValueError(
"If `include_top` is True, you should specify `classes`. "
f"Received: classes={classes}"
)
if include_top and pooling:
raise ValueError(
f"`pooling` must be `None` when `include_top=True`."
f"Received pooling={pooling} and include_top={include_top}. "
)
img_input = utils.parse_model_inputs(input_shape, input_tensor)
def round_filters(filters, divisor=depth_divisor):
"""Round number of filters based on depth multiplier."""
filters *= width_coefficient
new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_filters < 0.9 * filters:
new_filters += divisor
return int(new_filters)
def round_repeats(repeats):
"""Round number of repeats based on depth multiplier."""
return int(math.ceil(depth_coefficient * repeats))
# Build stem
x = img_input
if include_rescaling:
# Use common rescaling strategy across keras_cv
x = layers.Rescaling(1.0 / 255.0)(x)
x = layers.ZeroPadding2D(padding=correct_pad(x, 3), name="stem_conv_pad")(x)
x = layers.Conv2D(
round_filters(32),
3,
strides=2,
padding="valid",
use_bias=False,
kernel_initializer=CONV_KERNEL_INITIALIZER,
name="stem_conv",
)(x)
x = layers.BatchNormalization(axis=BN_AXIS, name="stem_bn")(x)
x = layers.Activation(activation, name="stem_activation")(x)
# Build blocks
blocks_args = copy.deepcopy(blocks_args)
b = 0
blocks = float(sum(round_repeats(args["repeats"]) for args in blocks_args))
for (i, args) in enumerate(blocks_args):
assert args["repeats"] > 0
# Update block input and output filters based on depth multiplier.
args["filters_in"] = round_filters(args["filters_in"])
args["filters_out"] = round_filters(args["filters_out"])
for j in range(round_repeats(args.pop("repeats"))):
# The first block needs to take care of stride and filter size
# increase.
if j > 0:
args["strides"] = 1
args["filters_in"] = args["filters_out"]
x = EfficientNetBlock(
activation,
drop_connect_rate * b / blocks,
name="block{}{}_".format(i + 1, chr(j + 97)),
**args,
)(x)
b += 1
# Build top
x = layers.Conv2D(
round_filters(1280),
1,
padding="same",
use_bias=False,
kernel_initializer=CONV_KERNEL_INITIALIZER,
name="top_conv",
)(x)
x = layers.BatchNormalization(axis=BN_AXIS, name="top_bn")(x)
x = layers.Activation(activation, name="top_activation")(x)
if include_top:
x = layers.GlobalAveragePooling2D(name="avg_pool")(x)
if dropout_rate > 0:
x = layers.Dropout(dropout_rate, name="top_dropout")(x)
x = layers.Dense(
classes,
activation=classifier_activation,
kernel_initializer=DENSE_KERNEL_INITIALIZER,
name="predictions",
)(x)
else:
if pooling == "avg":
x = layers.GlobalAveragePooling2D(name="avg_pool")(x)
elif pooling == "max":
x = layers.GlobalMaxPooling2D(name="max_pool")(x)
inputs = img_input
# Create model.
model = tf.keras.Model(inputs, x, name=model_name)
# Load weights.
if weights is not None:
model.load_weights(weights)
return model
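# Scaling arithmetic sketch (illustrative): with the EfficientNetB2
# coefficients defined below (width_coefficient=1.1, depth_coefficient=1.2,
# depth_divisor=8), round_filters(32) computes 32 * 1.1 = 35.2, snaps it to
# the divisor with int(35.2 + 4) // 8 * 8 = 32, and keeps 32 because it is not
# below 0.9 * 35.2 = 31.68; round_repeats(2) returns ceil(1.2 * 2) = 3 blocks.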
def EfficientNetB0(
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNet(
include_rescaling,
include_top,
width_coefficient=1.0,
depth_coefficient=1.0,
default_size=224,
dropout_rate=0.2,
model_name="efficientnetb0",
weights=parse_weights(weights, include_top, "efficientnetb0"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
def EfficientNetB1(
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNet(
include_rescaling,
include_top,
width_coefficient=1.0,
depth_coefficient=1.1,
default_size=240,
dropout_rate=0.2,
model_name="efficientnetb1",
weights=parse_weights(weights, include_top, "efficientnetb1"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
def EfficientNetB2(
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNet(
include_rescaling,
include_top,
width_coefficient=1.1,
depth_coefficient=1.2,
default_size=260,
dropout_rate=0.3,
model_name="efficientnetb2",
weights=parse_weights(weights, include_top, "efficientnetb2"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
def EfficientNetB3(
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNet(
include_rescaling,
include_top,
width_coefficient=1.2,
depth_coefficient=1.4,
default_size=300,
dropout_rate=0.3,
model_name="efficientnetb3",
weights=parse_weights(weights, include_top, "efficientnetb3"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
def EfficientNetB4(
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNet(
include_rescaling,
include_top,
width_coefficient=1.4,
depth_coefficient=1.8,
default_size=380,
dropout_rate=0.4,
model_name="efficientnetb4",
weights=parse_weights(weights, include_top, "efficientnetb4"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
def EfficientNetB5(
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNet(
include_rescaling,
include_top,
width_coefficient=1.6,
depth_coefficient=2.2,
default_size=456,
dropout_rate=0.4,
model_name="efficientnetb5",
weights=parse_weights(weights, include_top, "efficientnetb5"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
def EfficientNetB6(
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNet(
include_rescaling,
include_top,
width_coefficient=1.8,
depth_coefficient=2.6,
default_size=528,
dropout_rate=0.5,
model_name="efficientnetb6",
weights=parse_weights(weights, include_top, "efficientnetb6"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
def EfficientNetB7(
include_rescaling,
include_top,
classes=None,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNet(
include_rescaling,
include_top,
width_coefficient=2.0,
depth_coefficient=3.1,
default_size=600,
dropout_rate=0.5,
model_name="efficientnetb7",
weights=parse_weights(weights, include_top, "efficientnetb7"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
EfficientNetB0.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB0")
EfficientNetB1.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB1")
EfficientNetB2.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB2")
EfficientNetB3.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB3")
EfficientNetB4.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB4")
EfficientNetB5.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB5")
EfficientNetB6.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB6")
EfficientNetB7.__doc__ = BASE_DOCSTRING.format(name="EfficientNetB7")
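# Usage sketch (illustrative, not part of the upstream file; the input size and
# class count are assumptions):
#
#   from keras_cv.models import efficientnet_v1
#
#   model = efficientnet_v1.EfficientNetB0(
#       include_rescaling=True,
#       include_top=True,
#       classes=10,
#       input_shape=(224, 224, 3),
#   )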
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from absl.testing import parameterized
from keras_cv.models import efficientnet_v1
from .models_test import ModelsTest
MODEL_LIST = [
(efficientnet_v1.EfficientNetB0, 1280, {}),
(efficientnet_v1.EfficientNetB1, 1280, {}),
(efficientnet_v1.EfficientNetB2, 1408, {}),
(efficientnet_v1.EfficientNetB3, 1536, {}),
(efficientnet_v1.EfficientNetB4, 1792, {}),
(efficientnet_v1.EfficientNetB5, 2048, {}),
(efficientnet_v1.EfficientNetB6, 2304, {}),
(efficientnet_v1.EfficientNetB7, 2560, {}),
]
class EfficientNetV1Test(ModelsTest, tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(*MODEL_LIST)
def test_application_base(self, app, _, args):
super()._test_application_base(app, _, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_with_rescaling(self, app, last_dim, args):
super()._test_application_with_rescaling(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_pooling(self, app, last_dim, args):
super()._test_application_pooling(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_variable_input_channels(self, app, last_dim, args):
super()._test_application_variable_input_channels(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_model_can_be_used_as_backbone(self, app, last_dim, args):
super()._test_model_can_be_used_as_backbone(app, last_dim, args)
if __name__ == "__main__":
tf.test.main()
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""EfficientNet V2 models for KerasCV.
Reference:
- [EfficientNetV2: Smaller Models and Faster Training](
https://arxiv.org/abs/2104.00298) (ICML 2021)
- [Based on the original keras.applications EfficientNetV2](https://github.com/keras-team/keras/blob/master/keras/applications/efficientnet_v2.py)
"""
import copy
import math
import tensorflow as tf
from keras import backend
from keras import layers
from keras_cv.models import utils
from keras_cv.models.weights import parse_weights
DEFAULT_BLOCKS_ARGS = {
"efficientnetv2-s": [
{
"kernel_size": 3,
"num_repeat": 2,
"input_filters": 24,
"output_filters": 24,
"expand_ratio": 1,
"se_ratio": 0.0,
"strides": 1,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 4,
"input_filters": 24,
"output_filters": 48,
"expand_ratio": 4,
"se_ratio": 0.0,
"strides": 2,
"conv_type": 1,
},
{
"conv_type": 1,
"expand_ratio": 4,
"input_filters": 48,
"kernel_size": 3,
"num_repeat": 4,
"output_filters": 64,
"se_ratio": 0,
"strides": 2,
},
{
"conv_type": 0,
"expand_ratio": 4,
"input_filters": 64,
"kernel_size": 3,
"num_repeat": 6,
"output_filters": 128,
"se_ratio": 0.25,
"strides": 2,
},
{
"conv_type": 0,
"expand_ratio": 6,
"input_filters": 128,
"kernel_size": 3,
"num_repeat": 9,
"output_filters": 160,
"se_ratio": 0.25,
"strides": 1,
},
{
"conv_type": 0,
"expand_ratio": 6,
"input_filters": 160,
"kernel_size": 3,
"num_repeat": 15,
"output_filters": 256,
"se_ratio": 0.25,
"strides": 2,
},
],
"efficientnetv2-m": [
{
"kernel_size": 3,
"num_repeat": 3,
"input_filters": 24,
"output_filters": 24,
"expand_ratio": 1,
"se_ratio": 0,
"strides": 1,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 5,
"input_filters": 24,
"output_filters": 48,
"expand_ratio": 4,
"se_ratio": 0,
"strides": 2,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 5,
"input_filters": 48,
"output_filters": 80,
"expand_ratio": 4,
"se_ratio": 0,
"strides": 2,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 7,
"input_filters": 80,
"output_filters": 160,
"expand_ratio": 4,
"se_ratio": 0.25,
"strides": 2,
"conv_type": 0,
},
{
"kernel_size": 3,
"num_repeat": 14,
"input_filters": 160,
"output_filters": 176,
"expand_ratio": 6,
"se_ratio": 0.25,
"strides": 1,
"conv_type": 0,
},
{
"kernel_size": 3,
"num_repeat": 18,
"input_filters": 176,
"output_filters": 304,
"expand_ratio": 6,
"se_ratio": 0.25,
"strides": 2,
"conv_type": 0,
},
{
"kernel_size": 3,
"num_repeat": 5,
"input_filters": 304,
"output_filters": 512,
"expand_ratio": 6,
"se_ratio": 0.25,
"strides": 1,
"conv_type": 0,
},
],
"efficientnetv2-l": [
{
"kernel_size": 3,
"num_repeat": 4,
"input_filters": 32,
"output_filters": 32,
"expand_ratio": 1,
"se_ratio": 0,
"strides": 1,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 7,
"input_filters": 32,
"output_filters": 64,
"expand_ratio": 4,
"se_ratio": 0,
"strides": 2,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 7,
"input_filters": 64,
"output_filters": 96,
"expand_ratio": 4,
"se_ratio": 0,
"strides": 2,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 10,
"input_filters": 96,
"output_filters": 192,
"expand_ratio": 4,
"se_ratio": 0.25,
"strides": 2,
"conv_type": 0,
},
{
"kernel_size": 3,
"num_repeat": 19,
"input_filters": 192,
"output_filters": 224,
"expand_ratio": 6,
"se_ratio": 0.25,
"strides": 1,
"conv_type": 0,
},
{
"kernel_size": 3,
"num_repeat": 25,
"input_filters": 224,
"output_filters": 384,
"expand_ratio": 6,
"se_ratio": 0.25,
"strides": 2,
"conv_type": 0,
},
{
"kernel_size": 3,
"num_repeat": 7,
"input_filters": 384,
"output_filters": 640,
"expand_ratio": 6,
"se_ratio": 0.25,
"strides": 1,
"conv_type": 0,
},
],
"efficientnetv2-b0": [
{
"kernel_size": 3,
"num_repeat": 1,
"input_filters": 32,
"output_filters": 16,
"expand_ratio": 1,
"se_ratio": 0,
"strides": 1,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 2,
"input_filters": 16,
"output_filters": 32,
"expand_ratio": 4,
"se_ratio": 0,
"strides": 2,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 2,
"input_filters": 32,
"output_filters": 48,
"expand_ratio": 4,
"se_ratio": 0,
"strides": 2,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 3,
"input_filters": 48,
"output_filters": 96,
"expand_ratio": 4,
"se_ratio": 0.25,
"strides": 2,
"conv_type": 0,
},
{
"kernel_size": 3,
"num_repeat": 5,
"input_filters": 96,
"output_filters": 112,
"expand_ratio": 6,
"se_ratio": 0.25,
"strides": 1,
"conv_type": 0,
},
{
"kernel_size": 3,
"num_repeat": 8,
"input_filters": 112,
"output_filters": 192,
"expand_ratio": 6,
"se_ratio": 0.25,
"strides": 2,
"conv_type": 0,
},
],
"efficientnetv2-b1": [
{
"kernel_size": 3,
"num_repeat": 1,
"input_filters": 32,
"output_filters": 16,
"expand_ratio": 1,
"se_ratio": 0,
"strides": 1,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 2,
"input_filters": 16,
"output_filters": 32,
"expand_ratio": 4,
"se_ratio": 0,
"strides": 2,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 2,
"input_filters": 32,
"output_filters": 48,
"expand_ratio": 4,
"se_ratio": 0,
"strides": 2,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 3,
"input_filters": 48,
"output_filters": 96,
"expand_ratio": 4,
"se_ratio": 0.25,
"strides": 2,
"conv_type": 0,
},
{
"kernel_size": 3,
"num_repeat": 5,
"input_filters": 96,
"output_filters": 112,
"expand_ratio": 6,
"se_ratio": 0.25,
"strides": 1,
"conv_type": 0,
},
{
"kernel_size": 3,
"num_repeat": 8,
"input_filters": 112,
"output_filters": 192,
"expand_ratio": 6,
"se_ratio": 0.25,
"strides": 2,
"conv_type": 0,
},
],
"efficientnetv2-b2": [
{
"kernel_size": 3,
"num_repeat": 1,
"input_filters": 32,
"output_filters": 16,
"expand_ratio": 1,
"se_ratio": 0,
"strides": 1,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 2,
"input_filters": 16,
"output_filters": 32,
"expand_ratio": 4,
"se_ratio": 0,
"strides": 2,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 2,
"input_filters": 32,
"output_filters": 48,
"expand_ratio": 4,
"se_ratio": 0,
"strides": 2,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 3,
"input_filters": 48,
"output_filters": 96,
"expand_ratio": 4,
"se_ratio": 0.25,
"strides": 2,
"conv_type": 0,
},
{
"kernel_size": 3,
"num_repeat": 5,
"input_filters": 96,
"output_filters": 112,
"expand_ratio": 6,
"se_ratio": 0.25,
"strides": 1,
"conv_type": 0,
},
{
"kernel_size": 3,
"num_repeat": 8,
"input_filters": 112,
"output_filters": 192,
"expand_ratio": 6,
"se_ratio": 0.25,
"strides": 2,
"conv_type": 0,
},
],
"efficientnetv2-b3": [
{
"kernel_size": 3,
"num_repeat": 1,
"input_filters": 32,
"output_filters": 16,
"expand_ratio": 1,
"se_ratio": 0,
"strides": 1,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 2,
"input_filters": 16,
"output_filters": 32,
"expand_ratio": 4,
"se_ratio": 0,
"strides": 2,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 2,
"input_filters": 32,
"output_filters": 48,
"expand_ratio": 4,
"se_ratio": 0,
"strides": 2,
"conv_type": 1,
},
{
"kernel_size": 3,
"num_repeat": 3,
"input_filters": 48,
"output_filters": 96,
"expand_ratio": 4,
"se_ratio": 0.25,
"strides": 2,
"conv_type": 0,
},
{
"kernel_size": 3,
"num_repeat": 5,
"input_filters": 96,
"output_filters": 112,
"expand_ratio": 6,
"se_ratio": 0.25,
"strides": 1,
"conv_type": 0,
},
{
"kernel_size": 3,
"num_repeat": 8,
"input_filters": 112,
"output_filters": 192,
"expand_ratio": 6,
"se_ratio": 0.25,
"strides": 2,
"conv_type": 0,
},
],
}
CONV_KERNEL_INITIALIZER = {
"class_name": "VarianceScaling",
"config": {
"scale": 2.0,
"mode": "fan_out",
"distribution": "truncated_normal",
},
}
DENSE_KERNEL_INITIALIZER = {
"class_name": "VarianceScaling",
"config": {
"scale": 1.0 / 3.0,
"mode": "fan_out",
"distribution": "uniform",
},
}
BN_AXIS = 3
BASE_DOCSTRING = """Instantiates the {name} architecture.
Reference:
- [EfficientNetV2: Smaller Models and Faster Training](
https://arxiv.org/abs/2104.00298) (ICML 2021)
This function returns a Keras image classification model,
optionally loaded with weights pre-trained on ImageNet.
For image classification use cases, see
[this page for detailed examples](
https://keras.io/api/applications/#usage-examples-for-image-classification-models).
For transfer learning use cases, make sure to read the
[guide to transfer learning & fine-tuning](
https://keras.io/guides/transfer_learning/).
Args:
        include_rescaling: whether or not to rescale the inputs. If set to `True`,
            inputs will be passed through a `Rescaling(1/255.0)` layer.
include_top: Whether to include the fully-connected
layer at the top of the network.
weights: one of `None` (random initialization), a pretrained weight file
path, or a reference to pre-trained weights (e.g. 'imagenet/classification')
(see available pre-trained weights in weights.py)
        input_shape: Optional shape tuple.
            It should have exactly 3 input channels.
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
to use as image input for the model.
pooling: Optional pooling mode for feature extraction
when `include_top` is `False`. Defaults to None.
- `None` means that the output of the model will be
the 4D tensor output of the
last convolutional layer.
- `avg` means that global average pooling
will be applied to the output of the
last convolutional layer, and thus
the output of the model will be a 2D tensor.
- `max` means that global max pooling will
be applied.
        classes: Optional number of classes to classify images
            into, only to be specified if `include_top` is True, and
            if no `weights` argument is specified. Defaults to None.
classifier_activation: A `str` or callable. The activation function to use
on the "top" layer. Ignored unless `include_top=True`. Set
`classifier_activation=None` to return the logits of the "top" layer.
Defaults to 'softmax'.
When loading pretrained weights, `classifier_activation` can only
be `None` or `"softmax"`.
Returns:
A `keras.Model` instance.
"""
def round_filters(filters, width_coefficient, min_depth, depth_divisor):
    """Round the number of filters based on the width multiplier."""
filters *= width_coefficient
minimum_depth = min_depth or depth_divisor
new_filters = max(
minimum_depth,
int(filters + depth_divisor / 2) // depth_divisor * depth_divisor,
)
return int(new_filters)
def round_repeats(repeats, depth_coefficient):
"""Round number of repeats based on depth multiplier."""
return int(math.ceil(depth_coefficient * repeats))
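# A quick worked sketch of the two helpers above (values traced by hand and
# kept in comments so the module stays import-safe):
#   round_filters(32, width_coefficient=1.1, min_depth=8, depth_divisor=8)
#     -> 32 * 1.1 = 35.2 -> nearest multiple of 8 is 32 -> max(8, 32) = 32
#   round_filters(48, width_coefficient=1.2, min_depth=8, depth_divisor=8)
#     -> 48 * 1.2 = 57.6 -> nearest multiple of 8 is 56 -> max(8, 56) = 56
#   round_repeats(5, depth_coefficient=1.4) -> ceil(7.0) = 7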
def MBConvBlock(
input_filters: int,
output_filters: int,
expand_ratio=1,
kernel_size=3,
strides=1,
se_ratio=0.0,
bn_momentum=0.9,
activation="swish",
survival_probability: float = 0.8,
name=None,
):
"""MBConv block: Mobile Inverted Residual Bottleneck."""
if name is None:
name = backend.get_uid("block0")
def apply(inputs):
# Expansion phase
filters = input_filters * expand_ratio
if expand_ratio != 1:
x = layers.Conv2D(
filters=filters,
kernel_size=1,
strides=1,
kernel_initializer=CONV_KERNEL_INITIALIZER,
padding="same",
data_format="channels_last",
use_bias=False,
name=name + "expand_conv",
)(inputs)
x = layers.BatchNormalization(
axis=BN_AXIS,
momentum=bn_momentum,
name=name + "expand_bn",
)(x)
x = layers.Activation(activation, name=name + "expand_activation")(x)
else:
x = inputs
# Depthwise conv
x = layers.DepthwiseConv2D(
kernel_size=kernel_size,
strides=strides,
depthwise_initializer=CONV_KERNEL_INITIALIZER,
padding="same",
data_format="channels_last",
use_bias=False,
name=name + "dwconv2",
)(x)
x = layers.BatchNormalization(
axis=BN_AXIS, momentum=bn_momentum, name=name + "bn"
)(x)
x = layers.Activation(activation, name=name + "activation")(x)
# Squeeze and excite
if 0 < se_ratio <= 1:
filters_se = max(1, int(input_filters * se_ratio))
se = layers.GlobalAveragePooling2D(name=name + "se_squeeze")(x)
if BN_AXIS == 1:
se_shape = (filters, 1, 1)
else:
se_shape = (1, 1, filters)
se = layers.Reshape(se_shape, name=name + "se_reshape")(se)
se = layers.Conv2D(
filters_se,
1,
padding="same",
activation=activation,
kernel_initializer=CONV_KERNEL_INITIALIZER,
name=name + "se_reduce",
)(se)
se = layers.Conv2D(
filters,
1,
padding="same",
activation="sigmoid",
kernel_initializer=CONV_KERNEL_INITIALIZER,
name=name + "se_expand",
)(se)
x = layers.multiply([x, se], name=name + "se_excite")
# Output phase
x = layers.Conv2D(
filters=output_filters,
kernel_size=1,
strides=1,
kernel_initializer=CONV_KERNEL_INITIALIZER,
padding="same",
data_format="channels_last",
use_bias=False,
name=name + "project_conv",
)(x)
x = layers.BatchNormalization(
axis=BN_AXIS, momentum=bn_momentum, name=name + "project_bn"
)(x)
if strides == 1 and input_filters == output_filters:
if survival_probability:
x = layers.Dropout(
survival_probability,
noise_shape=(None, 1, 1, 1),
name=name + "drop",
)(x)
x = layers.add([x, inputs], name=name + "add")
return x
return apply
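# A minimal functional-usage sketch for the block above (shapes illustrative;
# assumes TensorFlow is available at runtime):
#   inputs = tf.keras.Input(shape=(32, 32, 24))
#   outputs = MBConvBlock(input_filters=24, output_filters=24, name="demo_")(inputs)
#   # strides == 1 and input_filters == output_filters, so the residual add is
#   # taken and the output keeps the input shape: (None, 32, 32, 24).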
def FusedMBConvBlock(
input_filters: int,
output_filters: int,
expand_ratio=1,
kernel_size=3,
strides=1,
se_ratio=0.0,
bn_momentum=0.9,
activation="swish",
survival_probability: float = 0.8,
name=None,
):
"""Fused MBConv Block: Fusing the proj conv1x1 and depthwise_conv into a
conv2d."""
if name is None:
name = backend.get_uid("block0")
def apply(inputs):
filters = input_filters * expand_ratio
if expand_ratio != 1:
x = layers.Conv2D(
filters,
kernel_size=kernel_size,
strides=strides,
kernel_initializer=CONV_KERNEL_INITIALIZER,
data_format="channels_last",
padding="same",
use_bias=False,
name=name + "expand_conv",
)(inputs)
x = layers.BatchNormalization(
axis=BN_AXIS, momentum=bn_momentum, name=name + "expand_bn"
)(x)
x = layers.Activation(
activation=activation, name=name + "expand_activation"
)(x)
else:
x = inputs
# Squeeze and excite
if 0 < se_ratio <= 1:
filters_se = max(1, int(input_filters * se_ratio))
se = layers.GlobalAveragePooling2D(name=name + "se_squeeze")(x)
if BN_AXIS == 1:
se_shape = (filters, 1, 1)
else:
se_shape = (1, 1, filters)
se = layers.Reshape(se_shape, name=name + "se_reshape")(se)
se = layers.Conv2D(
filters_se,
1,
padding="same",
activation=activation,
kernel_initializer=CONV_KERNEL_INITIALIZER,
name=name + "se_reduce",
)(se)
se = layers.Conv2D(
filters,
1,
padding="same",
activation="sigmoid",
kernel_initializer=CONV_KERNEL_INITIALIZER,
name=name + "se_expand",
)(se)
x = layers.multiply([x, se], name=name + "se_excite")
# Output phase:
x = layers.Conv2D(
output_filters,
kernel_size=1 if expand_ratio != 1 else kernel_size,
strides=1 if expand_ratio != 1 else strides,
kernel_initializer=CONV_KERNEL_INITIALIZER,
padding="same",
use_bias=False,
name=name + "project_conv",
)(x)
x = layers.BatchNormalization(
axis=BN_AXIS, momentum=bn_momentum, name=name + "project_bn"
)(x)
if expand_ratio == 1:
x = layers.Activation(
activation=activation, name=name + "project_activation"
)(x)
# Residual:
if strides == 1 and input_filters == output_filters:
if survival_probability:
x = layers.Dropout(
survival_probability,
noise_shape=(None, 1, 1, 1),
name=name + "drop",
)(x)
x = layers.add([x, inputs], name=name + "add")
return x
return apply
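# Design note: compared to MBConvBlock, the fused variant replaces the 1x1
# expansion conv + depthwise conv pair with a single regular convolution, which
# the EfficientNetV2 paper found to be faster on accelerators at the early,
# high-resolution stages -- hence conv_type == 1 (fused) appears only in the
# early entries of the block args above.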
def EfficientNetV2(
include_rescaling,
include_top,
width_coefficient,
depth_coefficient,
default_size,
dropout_rate=0.2,
drop_connect_rate=0.2,
depth_divisor=8,
min_depth=8,
bn_momentum=0.9,
activation="swish",
blocks_args="default",
model_name="efficientnet",
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classes=None,
classifier_activation="softmax",
**kwargs,
):
"""Instantiates the EfficientNetV2 architecture using given scaling
coefficients.
Args:
        include_rescaling: whether or not to rescale the inputs. If set to `True`,
            inputs will be passed through a `Rescaling(1/255.0)` layer.
include_top: whether to include the fully-connected
layer at the top of the network.
width_coefficient: float, scaling coefficient for network width.
depth_coefficient: float, scaling coefficient for network depth.
default_size: integer, default input image size.
dropout_rate: float, dropout rate before final classifier layer.
drop_connect_rate: float, dropout rate at skip connections.
depth_divisor: integer, a unit of network width.
min_depth: integer, minimum number of filters.
bn_momentum: float. Momentum parameter for Batch Normalization layers.
activation: activation function.
blocks_args: list of dicts, parameters to construct block modules.
model_name: string, model name.
weights: one of `None` (random initialization),
or the path to the weights file to be loaded.
        input_shape: optional shape tuple.
            It should have exactly 3 input channels.
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
to use as image input for the model.
pooling: optional pooling mode for feature extraction
when `include_top` is `False`.
- `None` means that the output of the model will be
the 4D tensor output of the
last convolutional layer.
- `avg` means that global average pooling
will be applied to the output of the
last convolutional layer, and thus
the output of the model will be a 2D tensor.
- `max` means that global max pooling will
be applied.
classes: optional number of classes to classify images
into, only to be specified if `include_top` is True, and
if no `weights` argument is specified.
classifier_activation: A `str` or callable. The activation function to use
on the "top" layer. Ignored unless `include_top=True`. Set
`classifier_activation=None` to return the logits of the "top" layer.
Returns:
A `keras.Model` instance.
Raises:
ValueError: in case of invalid argument for `weights`,
or invalid input shape.
ValueError: if `classifier_activation` is not `"softmax"` or `None` when
using a pretrained top layer.
"""
if blocks_args == "default":
blocks_args = DEFAULT_BLOCKS_ARGS[model_name]
    if weights and not tf.io.gfile.exists(weights):
        raise ValueError(
            "The `weights` argument should be either `None` or the path to the "
            f"weights file to be loaded. Weights file not found at location: {weights}"
        )
if include_top and not classes:
raise ValueError(
"If `include_top` is True, you should specify `classes`. "
f"Received: classes={classes}"
)
    if include_top and pooling:
        raise ValueError(
            "`pooling` must be `None` when `include_top=True`. "
            f"Received pooling={pooling} and include_top={include_top}."
        )
# Determine proper input shape
img_input = utils.parse_model_inputs(input_shape, input_tensor)
x = img_input
if include_rescaling:
x = layers.Rescaling(scale=1 / 255.0)(x)
# Build stem
stem_filters = round_filters(
filters=blocks_args[0]["input_filters"],
width_coefficient=width_coefficient,
min_depth=min_depth,
depth_divisor=depth_divisor,
)
x = layers.Conv2D(
filters=stem_filters,
kernel_size=3,
strides=2,
kernel_initializer=CONV_KERNEL_INITIALIZER,
padding="same",
use_bias=False,
name="stem_conv",
)(x)
x = layers.BatchNormalization(
axis=BN_AXIS,
momentum=bn_momentum,
name="stem_bn",
)(x)
x = layers.Activation(activation, name="stem_activation")(x)
# Build blocks
blocks_args = copy.deepcopy(blocks_args)
b = 0
blocks = float(sum(args["num_repeat"] for args in blocks_args))
for (i, args) in enumerate(blocks_args):
assert args["num_repeat"] > 0
# Update block input and output filters based on depth multiplier.
args["input_filters"] = round_filters(
filters=args["input_filters"],
width_coefficient=width_coefficient,
min_depth=min_depth,
depth_divisor=depth_divisor,
)
args["output_filters"] = round_filters(
filters=args["output_filters"],
width_coefficient=width_coefficient,
min_depth=min_depth,
depth_divisor=depth_divisor,
)
# Determine which conv type to use:
block = {0: MBConvBlock, 1: FusedMBConvBlock}[args.pop("conv_type")]
repeats = round_repeats(
repeats=args.pop("num_repeat"), depth_coefficient=depth_coefficient
)
for j in range(repeats):
# The first block needs to take care of stride and filter size
# increase.
if j > 0:
args["strides"] = 1
args["input_filters"] = args["output_filters"]
x = block(
activation=activation,
bn_momentum=bn_momentum,
survival_probability=drop_connect_rate * b / blocks,
name="block{}{}_".format(i + 1, chr(j + 97)),
**args,
)(x)
b += 1
# Build top
top_filters = round_filters(
filters=1280,
width_coefficient=width_coefficient,
min_depth=min_depth,
depth_divisor=depth_divisor,
)
x = layers.Conv2D(
filters=top_filters,
kernel_size=1,
strides=1,
kernel_initializer=CONV_KERNEL_INITIALIZER,
padding="same",
data_format="channels_last",
use_bias=False,
name="top_conv",
)(x)
x = layers.BatchNormalization(
axis=BN_AXIS,
momentum=bn_momentum,
name="top_bn",
)(x)
x = layers.Activation(activation=activation, name="top_activation")(x)
if include_top:
x = layers.GlobalAveragePooling2D(name="avg_pool")(x)
if dropout_rate > 0:
x = layers.Dropout(dropout_rate, name="top_dropout")(x)
x = layers.Dense(
classes,
activation=classifier_activation,
kernel_initializer=DENSE_KERNEL_INITIALIZER,
bias_initializer=tf.constant_initializer(0),
name="predictions",
)(x)
else:
if pooling == "avg":
x = layers.GlobalAveragePooling2D(name="avg_pool")(x)
elif pooling == "max":
x = layers.GlobalMaxPooling2D(name="max_pool")(x)
inputs = img_input
# Create model.
model = tf.keras.Model(inputs, x, **kwargs)
# Load weights.
if weights is not None:
model.load_weights(weights)
return model
def EfficientNetV2B0(
include_rescaling,
include_top,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classes=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNetV2(
include_rescaling=include_rescaling,
include_top=include_top,
width_coefficient=1.0,
depth_coefficient=1.0,
default_size=224,
model_name="efficientnetv2-b0",
weights=parse_weights(weights, include_top, "efficientnetv2b0"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
def EfficientNetV2B1(
include_rescaling,
include_top,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classes=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNetV2(
include_rescaling=include_rescaling,
include_top=include_top,
width_coefficient=1.0,
depth_coefficient=1.1,
default_size=240,
model_name="efficientnetv2-b1",
weights=parse_weights(weights, include_top, "efficientnetv2b1"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
def EfficientNetV2B2(
include_rescaling,
include_top,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classes=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNetV2(
include_rescaling=include_rescaling,
include_top=include_top,
width_coefficient=1.1,
depth_coefficient=1.2,
default_size=260,
model_name="efficientnetv2-b2",
weights=parse_weights(weights, include_top, "efficientnetv2b2"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
def EfficientNetV2B3(
include_rescaling,
include_top,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classes=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNetV2(
include_rescaling=include_rescaling,
include_top=include_top,
width_coefficient=1.2,
depth_coefficient=1.4,
default_size=300,
model_name="efficientnetv2-b3",
weights=parse_weights(weights, include_top, "efficientnetv2b3"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
def EfficientNetV2S(
include_rescaling,
include_top,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classes=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNetV2(
include_rescaling=include_rescaling,
include_top=include_top,
width_coefficient=1.0,
depth_coefficient=1.0,
default_size=384,
model_name="efficientnetv2-s",
weights=parse_weights(weights, include_top, "efficientnetv2s"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
def EfficientNetV2M(
include_rescaling,
include_top,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classes=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNetV2(
include_rescaling=include_rescaling,
include_top=include_top,
width_coefficient=1.0,
depth_coefficient=1.0,
default_size=480,
model_name="efficientnetv2-m",
weights=parse_weights(weights, include_top, "efficientnetv2m"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
def EfficientNetV2L(
include_rescaling,
include_top,
weights=None,
input_shape=(None, None, 3),
input_tensor=None,
pooling=None,
classes=None,
classifier_activation="softmax",
**kwargs,
):
return EfficientNetV2(
include_rescaling=include_rescaling,
include_top=include_top,
width_coefficient=1.0,
depth_coefficient=1.0,
default_size=480,
model_name="efficientnetv2-l",
weights=parse_weights(weights, include_top, "efficientnetv2l"),
input_shape=input_shape,
input_tensor=input_tensor,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
**kwargs,
)
EfficientNetV2B0.__doc__ = BASE_DOCSTRING.format(name="EfficientNetV2B0")
EfficientNetV2B1.__doc__ = BASE_DOCSTRING.format(name="EfficientNetV2B1")
EfficientNetV2B2.__doc__ = BASE_DOCSTRING.format(name="EfficientNetV2B2")
EfficientNetV2B3.__doc__ = BASE_DOCSTRING.format(name="EfficientNetV2B3")
EfficientNetV2S.__doc__ = BASE_DOCSTRING.format(name="EfficientNetV2S")
EfficientNetV2M.__doc__ = BASE_DOCSTRING.format(name="EfficientNetV2M")
EfficientNetV2L.__doc__ = BASE_DOCSTRING.format(name="EfficientNetV2L")
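# A minimal smoke-test sketch (hedged: weights are randomly initialized here;
# `classes` is only required because `include_top=True`):
if __name__ == "__main__":
    model = EfficientNetV2B0(include_rescaling=True, include_top=True, classes=10)
    print(model.output_shape)  # (None, 10)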
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from absl.testing import parameterized
from keras_cv.models import efficientnet_v2
from .models_test import ModelsTest
MODEL_LIST = [
(efficientnet_v2.EfficientNetV2B0, 1280, {}),
(efficientnet_v2.EfficientNetV2B1, 1280, {}),
(efficientnet_v2.EfficientNetV2B2, 1408, {}),
(efficientnet_v2.EfficientNetV2B3, 1536, {}),
(efficientnet_v2.EfficientNetV2S, 1280, {}),
(efficientnet_v2.EfficientNetV2M, 1280, {}),
(efficientnet_v2.EfficientNetV2L, 1280, {}),
]
class EfficientNetV2Test(ModelsTest, tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(*MODEL_LIST)
def test_application_base(self, app, _, args):
super()._test_application_base(app, _, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_with_rescaling(self, app, last_dim, args):
super()._test_application_with_rescaling(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_pooling(self, app, last_dim, args):
super()._test_application_pooling(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_application_variable_input_channels(self, app, last_dim, args):
super()._test_application_variable_input_channels(app, last_dim, args)
@parameterized.parameters(*MODEL_LIST)
def test_model_can_be_used_as_backbone(self, app, last_dim, args):
super()._test_model_can_be_used_as_backbone(app, last_dim, args)
if __name__ == "__main__":
tf.test.main()
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Stable Diffusion v1-4 Model Card
Stable Diffusion is a latent text-to-image diffusion model capable of generating photo-realistic images given any text input.
For more information about how Stable Diffusion functions, please have a look at [KerasCV's tutorial covering StableDiffusion](https://keras.io/guides/keras_cv/generate_images_with_stable_diffusion/).
The **Stable-Diffusion-v1-4** checkpoint was initialized with the weights of the [Stable-Diffusion-v1-2](https://huggingface.co/CompVis/stable-diffusion-v1-2)
checkpoint and subsequently fine-tuned for 225k steps at resolution 512x512 on "laion-aesthetics v2 5+", with 10% dropping of the text-conditioning to improve [classifier-free guidance sampling](https://arxiv.org/abs/2207.12598).
By loading this model you accept the CreativeML Open RAIL-M license at https://raw.githubusercontent.com/CompVis/stable-diffusion/main/LICENSE
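A minimal text-to-image sketch with the KerasCV port is shown below (the exact constructor and method arguments may vary between KerasCV versions, so treat it as illustrative rather than canonical):

    from keras_cv.models import StableDiffusion

    model = StableDiffusion(img_height=512, img_width=512)
    images = model.text_to_image("a photograph of an astronaut riding a horse", batch_size=1)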
## Model Details
- **Developed by:** Robin Rombach, Patrick Esser
- **Model type:** Diffusion-based text-to-image generation model
- **Language(s):** English
- **License:** [The CreativeML OpenRAIL M license](https://huggingface.co/spaces/CompVis/stable-diffusion-license) is an [Open RAIL M license](https://www.licenses.ai/blog/2022/8/18/naming-convention-of-responsible-ai-licenses), adapted from the work that [BigScience](https://bigscience.huggingface.co/) and [the RAIL Initiative](https://www.licenses.ai/) are jointly carrying out in the area of responsible AI licensing. See also [the article about the BLOOM Open RAIL license](https://bigscience.huggingface.co/blog/the-bigscience-rail-license) on which our license is based.
- **Model Description:** This is a model that can be used to generate and modify images based on text prompts. It is a [Latent Diffusion Model](https://arxiv.org/abs/2112.10752) that uses a fixed, pretrained text encoder ([CLIP ViT-L/14](https://arxiv.org/abs/2103.00020)) as suggested in the [Imagen paper](https://arxiv.org/abs/2205.11487).
- **Resources for more information:** [GitHub Repository](https://github.com/CompVis/stable-diffusion), [Paper](https://arxiv.org/abs/2112.10752).
- **Cite as:**
@InProceedings{Rombach_2022_CVPR,
author = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\"orn},
title = {High-Resolution Image Synthesis With Latent Diffusion Models},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
month = {June},
year = {2022},
pages = {10684-10695}
}
# Uses
## Direct Use
The model is intended for research purposes only. Possible research areas and
tasks include:
- Safe deployment of models which have the potential to generate harmful content.
- Probing and understanding the limitations and biases of generative models.
- Generation of artworks and use in design and other artistic processes.
- Applications in educational or creative tools.
- Research on generative models.
Excluded uses are described below.
### Misuse, Malicious Use, and Out-of-Scope Use
_Note: This section is taken from the [DALLE-MINI model card](https://huggingface.co/dalle-mini/dalle-mini), but applies in the same way to Stable Diffusion v1_.
The model should not be used to intentionally create or disseminate images that create hostile or alienating environments for people. This includes generating images that people would foreseeably find disturbing, distressing, or offensive; or content that propagates historical or current stereotypes.
#### Out-of-Scope Use
The model was not trained to produce factual or true representations of people or events, so using the model to generate such content is out-of-scope for its abilities.
#### Misuse and Malicious Use
Using the model to generate content that is cruel to individuals is a misuse of this model. This includes, but is not limited to:
- Generating demeaning, dehumanizing, or otherwise harmful representations of people or their environments, cultures, religions, etc.
- Intentionally promoting or propagating discriminatory content or harmful stereotypes.
- Impersonating individuals without their consent.
- Sexual content without consent of the people who might see it.
- Mis- and disinformation.
- Representations of egregious violence and gore.
- Sharing of copyrighted or licensed material in violation of its terms of use.
- Sharing content that is an alteration of copyrighted or licensed material in violation of its terms of use.
## Limitations and Bias
### Limitations
- The model does not achieve perfect photorealism.
- The model cannot render legible text.
- The model does not perform well on more difficult tasks which involve compositionality, such as rendering an image corresponding to “A red cube on top of a blue sphere”.
- Faces and people in general may not be generated properly.
- The model was trained mainly with English captions and will not work as well in other languages.
- The autoencoding part of the model is lossy.
- The model was trained on a large-scale dataset
[LAION-5B](https://laion.ai/blog/laion-5b/) which contains adult material
and is not fit for product use without additional safety mechanisms and
considerations.
- No additional measures were used to deduplicate the dataset. As a result, we observe some degree of memorization for images that are duplicated in the training data.
The training data can be searched at [https://rom1504.github.io/clip-retrieval/](https://rom1504.github.io/clip-retrieval/) to possibly assist in the detection of memorized images.
### Bias
While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases.
Stable Diffusion v1 was trained on subsets of [LAION-2B(en)](https://laion.ai/blog/laion-5b/),
which consists of images that are primarily limited to English descriptions.
Texts and images from communities and cultures that use other languages are likely to be insufficiently accounted for.
This affects the overall output of the model, as white and western cultures are often set as the default. Further, the
ability of the model to generate content with non-English prompts is significantly worse than with English-language prompts.
## More information
More information on StableDiffusion can be found in the [HuggingFace model card](https://huggingface.co/CompVis/stable-diffusion-v1-4).
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from keras_cv.models.generative.stable_diffusion.stable_diffusion import StableDiffusion
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2022 The KerasCV Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from tensorflow import keras
from keras_cv.models.generative.stable_diffusion.__internal__.layers.group_normalization import (
GroupNormalization,
)
from keras_cv.models.generative.stable_diffusion.__internal__.layers.padded_conv2d import (
PaddedConv2D,
)
class AttentionBlock(keras.layers.Layer):
def __init__(self, output_dim, **kwargs):
super().__init__(**kwargs)
self.output_dim = output_dim
self.norm = GroupNormalization(epsilon=1e-5)
self.q = PaddedConv2D(output_dim, 1)
self.k = PaddedConv2D(output_dim, 1)
self.v = PaddedConv2D(output_dim, 1)
self.proj_out = PaddedConv2D(output_dim, 1)
def call(self, inputs):
x = self.norm(inputs)
q, k, v = self.q(x), self.k(x), self.v(x)
# Compute attention
_, h, w, c = q.shape
q = tf.reshape(q, (-1, h * w, c)) # b, hw, c
k = tf.transpose(k, (0, 3, 1, 2))
k = tf.reshape(k, (-1, c, h * w)) # b, c, hw
y = q @ k
y = y * (c**-0.5)
y = keras.activations.softmax(y)
# Attend to values
v = tf.transpose(v, (0, 3, 1, 2))
v = tf.reshape(v, (-1, c, h * w))
y = tf.transpose(y, (0, 2, 1))
x = v @ y
x = tf.transpose(x, (0, 2, 1))
x = tf.reshape(x, (-1, h, w, c))
return self.proj_out(x) + inputs
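# Shape trace for call() above, with inputs of shape (batch, h, w, c) and
# c == output_dim: q @ k forms the (batch, h*w, h*w) attention matrix; after
# scaling by c**-0.5 and a softmax over the last axis, the transposed matrix is
# applied to v to give (batch, c, h*w), and the final transpose/reshape
# restores (batch, h, w, c) for the residual add.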
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
class GroupNormalization(tf.keras.layers.Layer):
"""GroupNormalization layer.
This layer is only here temporarily and will be removed
as we introduce GroupNormalization in core Keras.
"""
def __init__(
self,
groups=32,
axis=-1,
epsilon=1e-5,
**kwargs,
):
super().__init__(**kwargs)
self.groups = groups
self.axis = axis
self.epsilon = epsilon
def build(self, input_shape):
dim = input_shape[self.axis]
self.gamma = self.add_weight(
shape=(dim,),
name="gamma",
initializer="ones",
)
self.beta = self.add_weight(
shape=(dim,),
name="beta",
initializer="zeros",
)
def call(self, inputs):
input_shape = tf.shape(inputs)
reshaped_inputs = self._reshape_into_groups(inputs, input_shape)
normalized_inputs = self._apply_normalization(reshaped_inputs, input_shape)
return tf.reshape(normalized_inputs, input_shape)
def _reshape_into_groups(self, inputs, input_shape):
group_shape = [input_shape[i] for i in range(inputs.shape.rank)]
group_shape[self.axis] = input_shape[self.axis] // self.groups
group_shape.insert(self.axis, self.groups)
group_shape = tf.stack(group_shape)
return tf.reshape(inputs, group_shape)
def _apply_normalization(self, reshaped_inputs, input_shape):
group_reduction_axes = list(range(1, reshaped_inputs.shape.rank))
axis = -2 if self.axis == -1 else self.axis - 1
group_reduction_axes.pop(axis)
mean, variance = tf.nn.moments(
reshaped_inputs, group_reduction_axes, keepdims=True
)
gamma, beta = self._get_reshaped_weights(input_shape)
return tf.nn.batch_normalization(
reshaped_inputs,
mean=mean,
variance=variance,
scale=gamma,
offset=beta,
variance_epsilon=self.epsilon,
)
def _get_reshaped_weights(self, input_shape):
broadcast_shape = self._create_broadcast_shape(input_shape)
gamma = tf.reshape(self.gamma, broadcast_shape)
beta = tf.reshape(self.beta, broadcast_shape)
return gamma, beta
def _create_broadcast_shape(self, input_shape):
broadcast_shape = [1] * len(input_shape)
broadcast_shape[self.axis] = input_shape[self.axis] // self.groups
broadcast_shape.insert(self.axis, self.groups)
return broadcast_shape
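# A minimal self-check sketch (assumption: the channel count is divisible by
# `groups`, as the reshape above requires):
if __name__ == "__main__":
    x = tf.random.normal((2, 8, 8, 64))
    y = GroupNormalization(groups=32)(x)
    print(y.shape)  # (2, 8, 8, 64) -- group norm preserves the input shape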
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tensorflow import keras
class PaddedConv2D(keras.layers.Layer):
def __init__(self, filters, kernel_size, padding=0, strides=1, **kwargs):
super().__init__(**kwargs)
self.padding2d = keras.layers.ZeroPadding2D(padding)
self.conv2d = keras.layers.Conv2D(filters, kernel_size, strides=strides)
def call(self, inputs):
x = self.padding2d(inputs)
return self.conv2d(x)
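# Design note: keras.layers.Conv2D only offers "valid"/"same" padding, so this
# wrapper applies an explicit ZeroPadding2D first, mirroring the integer
# `padding` argument of the convolutions in the reference PyTorch
# implementation of Stable Diffusion.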
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tensorflow import keras
from keras_cv.models.generative.stable_diffusion.__internal__.layers.group_normalization import (
GroupNormalization,
)
from keras_cv.models.generative.stable_diffusion.__internal__.layers.padded_conv2d import (
PaddedConv2D,
)
class ResnetBlock(keras.layers.Layer):
def __init__(self, output_dim, **kwargs):
super().__init__(**kwargs)
self.output_dim = output_dim
self.norm1 = GroupNormalization(epsilon=1e-5)
self.conv1 = PaddedConv2D(output_dim, 3, padding=1)
self.norm2 = GroupNormalization(epsilon=1e-5)
self.conv2 = PaddedConv2D(output_dim, 3, padding=1)
def build(self, input_shape):
if input_shape[-1] != self.output_dim:
self.residual_projection = PaddedConv2D(self.output_dim, 1)
else:
self.residual_projection = lambda x: x
def call(self, inputs):
x = self.conv1(keras.activations.swish(self.norm1(inputs)))
x = self.conv2(keras.activations.swish(self.norm2(x)))
return x + self.residual_projection(inputs)
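# Note: the residual projection is resolved at build time -- identity when the
# incoming channel count already equals `output_dim`, otherwise a 1x1 conv
# that matches channels so the addition in call() is well-defined.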
# Copyright 2022 The KerasCV Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This code is taken nearly verbatim from https://github.com/divamgupta/stable-diffusion-tensorflow."""
import gzip
import html
from functools import lru_cache
import regex as re
from tensorflow import keras
@lru_cache()
def bytes_to_unicode():
"""Return a list of utf-8 bytes and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    This also avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = (
list(range(ord("!"), ord("~") + 1))
+ list(range(ord("¡"), ord("¬") + 1))
+ list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
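# Example: printable bytes map to themselves while the rest are shifted into
# unused code points, e.g. bytes_to_unicode()[ord("!")] == "!" and
# bytes_to_unicode()[0] == chr(256).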
def get_pairs(word):
"""Return set of symbol pairs in a word.
A word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
def basic_clean(text):
text = html.unescape(html.unescape(text))
return text.strip()
def whitespace_clean(text):
text = re.sub(r"\s+", " ", text)
text = text.strip()
return text
class SimpleTokenizer:
def __init__(self, bpe_path=None):
bpe_path = bpe_path or keras.utils.get_file(
"bpe_simple_vocab_16e6.txt.gz",
"https://github.com/openai/CLIP/blob/main/clip/bpe_simple_vocab_16e6.txt.gz?raw=true",
file_hash="924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a",
)
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
merges = gzip.open(bpe_path).read().decode("utf-8").split("\n")
merges = merges[1 : 49152 - 256 - 2 + 1]
merges = [tuple(merge.split()) for merge in merges]
vocab = list(bytes_to_unicode().values())
vocab = vocab + [v + "</w>" for v in vocab]
for merge in merges:
vocab.append("".join(merge))
vocab.extend(["<|startoftext|>", "<|endoftext|>"])
self.encoder = dict(zip(vocab, range(len(vocab))))
self.decoder = {v: k for k, v in self.encoder.items()}
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {
"<|startoftext|>": "<|startoftext|>",
"<|endoftext|>": "<|endoftext|>",
}
self.pat = re.compile(
r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
re.IGNORECASE,
)
@property
def end_of_text(self):
return self.encoder["<|endoftext|>"]
@property
def start_of_text(self):
return self.encoder["<|startoftext|>"]
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token[:-1]) + (token[-1] + "</w>",)
pairs = get_pairs(word)
if not pairs:
return token + "</w>"
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
                except ValueError:
new_word.extend(word[i:])
break
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
def encode(self, text):
bpe_tokens = []
text = whitespace_clean(basic_clean(text)).lower()
for token in re.findall(self.pat, text):
token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
bpe_tokens.extend(
self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")
)
return [self.start_of_text] + bpe_tokens + [self.end_of_text]
def decode(self, tokens):
text = "".join([self.decoder[token] for token in tokens])
text = (
bytearray([self.byte_decoder[c] for c in text])
.decode("utf-8", errors="replace")
.replace("</w>", " ")
)
return text
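# A minimal round-trip sketch (note: the first call downloads the BPE vocab):
if __name__ == "__main__":
    tokenizer = SimpleTokenizer()
    tokens = tokenizer.encode("a photograph of an astronaut")
    print(tokens[0] == tokenizer.start_of_text, tokens[-1] == tokenizer.end_of_text)
    print(tokenizer.decode(tokens[1:-1]))  # "a photograph of an astronaut "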