"vscode:/vscode.git/clone" did not exist on "e6eba8419c733f83647007ddaf87f2de8ce5885b"
Unverified Commit 49ab1623 authored by Alara Dirik's avatar Alara Dirik Committed by GitHub
Browse files

Add EfficientNet (#21563)

* Add EfficientNet to transformers
parent c9a06714
......@@ -74,6 +74,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("dpr", "DPRConfig"),
("dpt", "DPTConfig"),
("efficientformer", "EfficientFormerConfig"),
("efficientnet", "EfficientNetConfig"),
("electra", "ElectraConfig"),
("encoder-decoder", "EncoderDecoderConfig"),
("ernie", "ErnieConfig"),
......@@ -248,6 +249,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
("dpr", "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("dpt", "DPT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("efficientformer", "EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("efficientnet", "EFFICIENTNET_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("electra", "ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("ernie", "ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("ernie_m", "ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP"),
......@@ -417,6 +419,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("dpr", "DPR"),
("dpt", "DPT"),
("efficientformer", "EfficientFormer"),
("efficientnet", "EfficientNet"),
("electra", "ELECTRA"),
("encoder-decoder", "Encoder decoder"),
("ernie", "ERNIE"),
......
......@@ -57,6 +57,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
("donut-swin", "DonutImageProcessor"),
("dpt", "DPTImageProcessor"),
("efficientformer", "EfficientFormerImageProcessor"),
("efficientnet", "EfficientNetImageProcessor"),
("flava", "FlavaImageProcessor"),
("git", "CLIPImageProcessor"),
("glpn", "GLPNImageProcessor"),
......
......@@ -73,6 +73,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("dpr", "DPRQuestionEncoder"),
("dpt", "DPTModel"),
("efficientformer", "EfficientFormerModel"),
("efficientnet", "EfficientNetModel"),
("electra", "ElectraModel"),
("ernie", "ErnieModel"),
("ernie_m", "ErnieMModel"),
......@@ -419,6 +420,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
"EfficientFormerForImageClassificationWithTeacher",
),
),
("efficientnet", "EfficientNetForImageClassification"),
("imagegpt", "ImageGPTForImageClassification"),
("levit", ("LevitForImageClassification", "LevitForImageClassificationWithTeacher")),
("mobilenet_v1", "MobileNetV1ForImageClassification"),
......@@ -933,6 +935,7 @@ MODEL_FOR_BACKBONE_MAPPING_NAMES = OrderedDict(
("bit", "BitBackbone"),
("convnext", "ConvNextBackbone"),
("dinat", "DinatBackbone"),
("efficientnet", "EfficientNetBackbone"),
("maskformer-swin", "MaskFormerSwinBackbone"),
("nat", "NatBackbone"),
("resnet", "ResNetBackbone"),
......
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
# rely on isort to merge the imports
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_efficientnet": [
"EFFICIENTNET_PRETRAINED_CONFIG_ARCHIVE_MAP",
"EfficientNetConfig",
"EfficientNetOnnxConfig",
]
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_efficientnet"] = ["EfficientNetImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_efficientnet"] = [
"EFFICIENTNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"EfficientNetForImageClassification",
"EfficientNetModel",
"EfficientNetPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_efficientnet import (
EFFICIENTNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
EfficientNetConfig,
EfficientNetOnnxConfig,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_efficientnet import EfficientNetImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_efficientnet import (
EFFICIENTNET_PRETRAINED_MODEL_ARCHIVE_LIST,
EfficientNetForImageClassification,
EfficientNetModel,
EfficientNetPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
# coding=utf-8
# Copyright 2023 Google Research, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" EfficientNet model configuration"""
from collections import OrderedDict
from typing import List, Mapping
from packaging import version
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
EFFICIENTNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/efficientnet-b7": "https://huggingface.co/google/efficientnet-b7/resolve/main/config.json",
}
class EfficientNetConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`EfficientNetModel`]. It is used to instantiate an
EfficientNet model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the EfficientNet
[google/efficientnet-b7](https://huggingface.co/google/efficientnet-b7) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
image_size (`int`, *optional*, defaults to 600):
The input image size.
width_coefficient (`float`, *optional*, defaults to 2.0):
Scaling coefficient for network width at each stage.
depth_coefficient (`float`, *optional*, defaults to 3.1):
Scaling coefficient for network depth at each stage.
depth_divisor `int`, *optional*, defaults to 8):
A unit of network width.
kernel_sizes (`List[int]`, *optional*, defaults to `[3, 3, 5, 3, 5, 5, 3]`):
List of kernel sizes to be used in each block.
in_channels (`List[int]`, *optional*, defaults to `[32, 16, 24, 40, 80, 112, 192]`):
List of input channel sizes to be used in each block for convolutional layers.
out_channels (`List[int]`, *optional*, defaults to `[16, 24, 40, 80, 112, 192, 320]`):
List of output channel sizes to be used in each block for convolutional layers.
depthwise_padding (`List[int]`, *optional*, defaults to `[]`):
List of block indices with square padding.
strides: (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
List of stride sizes to be used in each block for convolutional layers.
num_block_repeats (`List[int]`, *optional*, defaults to `[1, 2, 2, 3, 3, 4, 1]`):
List of the number of times each block is to repeated.
expand_ratios (`List[int]`, *optional*, defaults to `[1, 6, 6, 6, 6, 6, 6]`):
List of scaling coefficient of each block.
squeeze_expansion_ratio (`float`, *optional*, defaults to 0.25):
Squeeze expansion ratio.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in each block. If string, `"gelu"`, `"relu"`,
`"selu", `"gelu_new"`, `"silu"` and `"mish"` are supported.
hiddem_dim (`int`, *optional*, defaults to 1280):
The hidden dimension of the layer before the classification head.
pooling_type (`str` or `function`, *optional*, defaults to `"mean"`):
Type of final pooling to be applied before the dense classification head. Available options are [`"mean"`,
`"max"`]
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
batch_norm_eps (`float`, *optional*, defaults to 1e-3):
The epsilon used by the batch normalization layers.
batch_norm_momentum (`float`, *optional*, defaults to 0.99):
The momentum used by the batch normalization layers.
dropout_rate (`float`, *optional*, defaults to 0.5):
The dropout rate to be applied before final classifier layer.
drop_connect_rate (`float`, *optional*, defaults to 0.2):
The drop rate for skip connections.
Example:
```python
>>> from transformers import EfficientNetConfig, EfficientNetModel
>>> # Initializing a EfficientNet efficientnet-b7 style configuration
>>> configuration = EfficientNetConfig()
>>> # Initializing a model (with random weights) from the efficientnet-b7 style configuration
>>> model = EfficientNetModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "efficientnet"
def __init__(
self,
num_channels: int = 3,
image_size: int = 600,
width_coefficient: float = 2.0,
depth_coefficient: float = 3.1,
depth_divisor: int = 8,
kernel_sizes: List[int] = [3, 3, 5, 3, 5, 5, 3],
in_channels: List[int] = [32, 16, 24, 40, 80, 112, 192],
out_channels: List[int] = [16, 24, 40, 80, 112, 192, 320],
depthwise_padding: List[int] = [],
strides: List[int] = [1, 2, 2, 2, 1, 2, 1],
num_block_repeats: List[int] = [1, 2, 2, 3, 3, 4, 1],
expand_ratios: List[int] = [1, 6, 6, 6, 6, 6, 6],
squeeze_expansion_ratio: float = 0.25,
hidden_act: str = "swish",
hidden_dim: int = 2560,
pooling_type: str = "mean",
initializer_range: float = 0.02,
batch_norm_eps: float = 0.001,
batch_norm_momentum: float = 0.99,
dropout_rate: float = 0.5,
drop_connect_rate: float = 0.2,
**kwargs,
):
super().__init__(**kwargs)
self.num_channels = num_channels
self.image_size = image_size
self.width_coefficient = width_coefficient
self.depth_coefficient = depth_coefficient
self.depth_divisor = depth_divisor
self.kernel_sizes = kernel_sizes
self.in_channels = in_channels
self.out_channels = out_channels
self.depthwise_padding = depthwise_padding
self.strides = strides
self.num_block_repeats = num_block_repeats
self.expand_ratios = expand_ratios
self.squeeze_expansion_ratio = squeeze_expansion_ratio
self.hidden_act = hidden_act
self.hidden_dim = hidden_dim
self.pooling_type = pooling_type
self.initializer_range = initializer_range
self.batch_norm_eps = batch_norm_eps
self.batch_norm_momentum = batch_norm_momentum
self.dropout_rate = dropout_rate
self.drop_connect_rate = drop_connect_rate
self.num_hidden_layers = sum(num_block_repeats) * 4
class EfficientNetOnnxConfig(OnnxConfig):
torch_onnx_minimum_version = version.parse("1.11")
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-5
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert EfficientNet checkpoints from the original repository.
URL: https://github.com/keras-team/keras/blob/v2.11.0/keras/applications/efficientnet.py"""
import argparse
import json
import os
import numpy as np
import PIL
import requests
import tensorflow.keras.applications.efficientnet as efficientnet
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from tensorflow.keras.preprocessing import image
from transformers import (
EfficientNetConfig,
EfficientNetForImageClassification,
EfficientNetImageProcessor,
)
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
model_classes = {
"b0": efficientnet.EfficientNetB0,
"b1": efficientnet.EfficientNetB1,
"b2": efficientnet.EfficientNetB2,
"b3": efficientnet.EfficientNetB3,
"b4": efficientnet.EfficientNetB4,
"b5": efficientnet.EfficientNetB5,
"b6": efficientnet.EfficientNetB6,
"b7": efficientnet.EfficientNetB7,
}
CONFIG_MAP = {
"b0": {
"hidden_dim": 1280,
"width_coef": 1.0,
"depth_coef": 1.0,
"image_size": 224,
"dropout_rate": 0.2,
"dw_padding": [],
},
"b1": {
"hidden_dim": 1280,
"width_coef": 1.0,
"depth_coef": 1.1,
"image_size": 240,
"dropout_rate": 0.2,
"dw_padding": [16],
},
"b2": {
"hidden_dim": 1408,
"width_coef": 1.1,
"depth_coef": 1.2,
"image_size": 260,
"dropout_rate": 0.3,
"dw_padding": [5, 8, 16],
},
"b3": {
"hidden_dim": 1536,
"width_coef": 1.2,
"depth_coef": 1.4,
"image_size": 300,
"dropout_rate": 0.3,
"dw_padding": [5, 18],
},
"b4": {
"hidden_dim": 1792,
"width_coef": 1.4,
"depth_coef": 1.8,
"image_size": 380,
"dropout_rate": 0.4,
"dw_padding": [6],
},
"b5": {
"hidden_dim": 2048,
"width_coef": 1.6,
"depth_coef": 2.2,
"image_size": 456,
"dropout_rate": 0.4,
"dw_padding": [13, 27],
},
"b6": {
"hidden_dim": 2304,
"width_coef": 1.8,
"depth_coef": 2.6,
"image_size": 528,
"dropout_rate": 0.5,
"dw_padding": [31],
},
"b7": {
"hidden_dim": 2560,
"width_coef": 2.0,
"depth_coef": 3.1,
"image_size": 600,
"dropout_rate": 0.5,
"dw_padding": [18],
},
}
def get_efficientnet_config(model_name):
config = EfficientNetConfig()
config.hidden_dim = CONFIG_MAP[model_name]["hidden_dim"]
config.width_coefficient = CONFIG_MAP[model_name]["width_coef"]
config.depth_coefficient = CONFIG_MAP[model_name]["depth_coef"]
config.image_size = CONFIG_MAP[model_name]["image_size"]
config.dropout_rate = CONFIG_MAP[model_name]["dropout_rate"]
config.depthwise_padding = CONFIG_MAP[model_name]["dw_padding"]
repo_id = "huggingface/label-files"
filename = "imagenet-1k-id2label.json"
config.num_labels = 1000
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
def convert_image_processor(model_name):
size = CONFIG_MAP[model_name]["image_size"]
preprocessor = EfficientNetImageProcessor(
size={"height": size, "width": size},
image_mean=[0.485, 0.456, 0.406],
image_std=[0.47853944, 0.4732864, 0.47434163],
do_center_crop=False,
)
return preprocessor
# here we list all keys to be renamed (original name on the left, our name on the right)
def rename_keys(original_param_names):
block_names = [v.split("_")[0].split("block")[1] for v in original_param_names if v.startswith("block")]
block_names = sorted(list(set(block_names)))
num_blocks = len(block_names)
block_name_mapping = {b: str(i) for b, i in zip(block_names, range(num_blocks))}
rename_keys = []
rename_keys.append(("stem_conv/kernel:0", "embeddings.convolution.weight"))
rename_keys.append(("stem_bn/gamma:0", "embeddings.batchnorm.weight"))
rename_keys.append(("stem_bn/beta:0", "embeddings.batchnorm.bias"))
rename_keys.append(("stem_bn/moving_mean:0", "embeddings.batchnorm.running_mean"))
rename_keys.append(("stem_bn/moving_variance:0", "embeddings.batchnorm.running_var"))
for b in block_names:
hf_b = block_name_mapping[b]
rename_keys.append((f"block{b}_expand_conv/kernel:0", f"encoder.blocks.{hf_b}.expansion.expand_conv.weight"))
rename_keys.append((f"block{b}_expand_bn/gamma:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.weight"))
rename_keys.append((f"block{b}_expand_bn/beta:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.bias"))
rename_keys.append(
(f"block{b}_expand_bn/moving_mean:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_mean")
)
rename_keys.append(
(f"block{b}_expand_bn/moving_variance:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_var")
)
rename_keys.append(
(f"block{b}_dwconv/depthwise_kernel:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_conv.weight")
)
rename_keys.append((f"block{b}_bn/gamma:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.weight"))
rename_keys.append((f"block{b}_bn/beta:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.bias"))
rename_keys.append(
(f"block{b}_bn/moving_mean:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_mean")
)
rename_keys.append(
(f"block{b}_bn/moving_variance:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_var")
)
rename_keys.append((f"block{b}_se_reduce/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.weight"))
rename_keys.append((f"block{b}_se_reduce/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.bias"))
rename_keys.append((f"block{b}_se_expand/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.weight"))
rename_keys.append((f"block{b}_se_expand/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.bias"))
rename_keys.append(
(f"block{b}_project_conv/kernel:0", f"encoder.blocks.{hf_b}.projection.project_conv.weight")
)
rename_keys.append((f"block{b}_project_bn/gamma:0", f"encoder.blocks.{hf_b}.projection.project_bn.weight"))
rename_keys.append((f"block{b}_project_bn/beta:0", f"encoder.blocks.{hf_b}.projection.project_bn.bias"))
rename_keys.append(
(f"block{b}_project_bn/moving_mean:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_mean")
)
rename_keys.append(
(f"block{b}_project_bn/moving_variance:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_var")
)
rename_keys.append(("top_conv/kernel:0", "encoder.top_conv.weight"))
rename_keys.append(("top_bn/gamma:0", "encoder.top_bn.weight"))
rename_keys.append(("top_bn/beta:0", "encoder.top_bn.bias"))
rename_keys.append(("top_bn/moving_mean:0", "encoder.top_bn.running_mean"))
rename_keys.append(("top_bn/moving_variance:0", "encoder.top_bn.running_var"))
key_mapping = {}
for item in rename_keys:
if item[0] in original_param_names:
key_mapping[item[0]] = "efficientnet." + item[1]
key_mapping["predictions/kernel:0"] = "classifier.weight"
key_mapping["predictions/bias:0"] = "classifier.bias"
return key_mapping
def replace_params(hf_params, tf_params, key_mapping):
for key, value in tf_params.items():
if "normalization" in key:
continue
hf_key = key_mapping[key]
if "_conv" in key and "kernel" in key:
new_hf_value = torch.from_numpy(value).permute(3, 2, 0, 1)
elif "depthwise_kernel" in key:
new_hf_value = torch.from_numpy(value).permute(2, 3, 0, 1)
elif "kernel" in key:
new_hf_value = torch.from_numpy(np.transpose(value))
else:
new_hf_value = torch.from_numpy(value)
# Replace HF parameters with original TF model parameters
assert hf_params[hf_key].shape == new_hf_value.shape
hf_params[hf_key].copy_(new_hf_value)
@torch.no_grad()
def convert_efficientnet_checkpoint(model_name, pytorch_dump_folder_path, save_model, push_to_hub):
"""
Copy/paste/tweak model's weights to our EfficientNet structure.
"""
# Load original model
original_model = model_classes[model_name](
include_top=True,
weights="imagenet",
input_tensor=None,
input_shape=None,
pooling=None,
classes=1000,
classifier_activation="softmax",
)
tf_params = original_model.trainable_variables
tf_non_train_params = original_model.non_trainable_variables
tf_params = {param.name: param.numpy() for param in tf_params}
for param in tf_non_train_params:
tf_params[param.name] = param.numpy()
tf_param_names = [k for k in tf_params.keys()]
# Load HuggingFace model
config = get_efficientnet_config(model_name)
hf_model = EfficientNetForImageClassification(config).eval()
hf_params = hf_model.state_dict()
# Create src-to-dst parameter name mapping dictionary
print("Converting parameters...")
key_mapping = rename_keys(tf_param_names)
replace_params(hf_params, tf_params, key_mapping)
# Initialize preprocessor and preprocess input image
preprocessor = convert_image_processor(model_name)
inputs = preprocessor(images=prepare_img(), return_tensors="pt")
# HF model inference
hf_model.eval()
with torch.no_grad():
outputs = hf_model(**inputs)
hf_logits = outputs.logits.detach().numpy()
# Original model inference
original_model.trainable = False
image_size = CONFIG_MAP[model_name]["image_size"]
img = prepare_img().resize((image_size, image_size), resample=PIL.Image.NEAREST)
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
original_logits = original_model.predict(x)
# Check whether original and HF model outputs match -> np.allclose
assert np.allclose(original_logits, hf_logits, atol=1e-3), "The predicted logits are not the same."
print("Model outputs match!")
if save_model:
# Create folder to save model
if not os.path.isdir(pytorch_dump_folder_path):
os.mkdir(pytorch_dump_folder_path)
# Save converted model and feature extractor
hf_model.save_pretrained(pytorch_dump_folder_path)
preprocessor.save_pretrained(pytorch_dump_folder_path)
if push_to_hub:
# Push model and feature extractor to hub
print(f"Pushing converted {model_name} to the hub...")
model_name = f"efficientnet-{model_name}"
preprocessor.push_to_hub(model_name)
hf_model.push_to_hub(model_name)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--model_name",
default="b0",
type=str,
help="Version name of the EfficientNet model you want to convert, select from [b0, b1, b2, b3, b4, b5, b6, b7].",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default="hf_model",
type=str,
help="Path to the output PyTorch model directory.",
)
parser.add_argument("--save_model", action="store_true", help="Save model to local")
parser.add_argument("--push_to_hub", action="store_true", help="Push model and feature extractor to the hub")
args = parser.parse_args()
convert_efficientnet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub)
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for EfficientNet."""
from typing import Dict, List, Optional, Union
import numpy as np
from transformers.utils import is_vision_available
from transformers.utils.generic import TensorType
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import center_crop, normalize, rescale, resize, to_channel_dimension_format
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
make_list_of_images,
to_numpy_array,
valid_images,
)
from ...utils import logging
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
class EfficientNetImageProcessor(BaseImageProcessor):
r"""
Constructs a EfficientNet image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
`do_resize` in `preprocess`.
size (`Dict[str, int]` *optional*, defaults to `{"height": 346, "width": 346}`):
Size of the image after `resize`. Can be overridden by `size` in `preprocess`.
resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.NEAREST`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
do_center_crop (`bool`, *optional*, defaults to `False`):
Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image
is padded with 0's and then center cropped. Can be overridden by `do_center_crop` in `preprocess`.
crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 289, "width": 289}`):
Desired output size when applying center-cropping. Can be overridden by `crop_size` in `preprocess`.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
rescale_offset (`bool`, *optional*, defaults to `False`):
Whether to rescale the image between [-scale_range, scale_range] instead of [0, scale_range]. Can be
overridden by the `rescale_factor` parameter in the `preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
include_top (`bool`, *optional*, defaults to `True`):
Whether to rescale the image again. Should be set to True if the inputs are used for image classification.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PIL.Image.NEAREST,
do_center_crop: bool = False,
crop_size: Dict[str, int] = None,
rescale_factor: Union[int, float] = 1 / 255,
rescale_offset: bool = False,
do_rescale: bool = True,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
include_top: bool = True,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 346, "width": 346}
size = get_size_dict(size)
crop_size = crop_size if crop_size is not None else {"height": 289, "width": 289}
crop_size = get_size_dict(crop_size, param_name="crop_size")
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.rescale_offset = rescale_offset
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.include_top = include_top
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PIL.Image.NEAREST,
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])` using the specified resampling filter.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image.
resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.NEAREST`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The size dictionary must have keys 'height' and 'width'. Got {size.keys()}")
return resize(
image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs
)
def center_crop(
self,
image: np.ndarray,
size: Dict[str, int],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Center crop an image to `(crop_size["height"], crop_size["width"])`. If the input size is smaller than
`crop_size` along any edge, the image is padded with 0's and then center cropped.
Args:
image (`np.ndarray`):
Image to center crop.
size (`Dict[str, int]`):
Size of the output image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The size dictionary must have keys 'height' and 'width'. Got {size.keys()}")
return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs)
def rescale(
self,
image: np.ndarray,
scale: Union[int, float],
offset: bool = True,
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
"""
Rescale an image by a scale factor. image = image * scale.
Args:
image (`np.ndarray`):
Image to rescale.
scale (`int` or `float`):
Scale to apply to the image.
offset (`bool`, *optional*):
Whether to scale the image in both negative and positive directions.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
if offset:
rescaled_image = (image - 127.5) * scale
if data_format is not None:
rescaled_image = to_channel_dimension_format(rescaled_image, data_format)
rescaled_image = rescaled_image.astype(np.float32)
else:
rescaled_image = rescale(image, scale=scale, data_format=data_format, **kwargs)
return rescale(image, scale=scale, data_format=data_format, **kwargs)
def normalize(
self,
image: np.ndarray,
mean: Union[float, List[float]],
std: Union[float, List[float]],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Normalize an image. image = (image - image_mean) / image_std.
Args:
image (`np.ndarray`):
Image to normalize.
image_mean (`float` or `List[float]`):
Image mean.
image_std (`float` or `List[float]`):
Image standard deviation.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample=None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
rescale_offset: bool = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
include_top: bool = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
**kwargs,
) -> PIL.Image.Image:
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after `resize`.
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
PILImageResampling filter to use if resizing the image Only has an effect if `do_resize` is set to
`True`.
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image.
crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
Size of the image after center crop. If one edge the image is smaller than `crop_size`, it will be
padded with zeros and then cropped
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image values between [0 - 1].
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`):
Whether to rescale the image between [-scale_range, scale_range] instead of [0, scale_range].
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation.
include_top (`bool`, *optional*, defaults to `self.include_top`):
Rescales the image again for image classification if set to True.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- `None`: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
resample = resample if resample is not None else self.resample
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
rescale_offset = rescale_offset if rescale_offset is not None else self.rescale_offset
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
include_top = include_top if include_top is not None else self.include_top
size = size if size is not None else self.size
size = get_size_dict(size)
crop_size = crop_size if crop_size is not None else self.crop_size
crop_size = get_size_dict(crop_size, param_name="crop_size")
images = make_list_of_images(images)
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
if do_resize and size is None or resample is None:
raise ValueError("Size and resample must be specified if do_resize is True.")
if do_center_crop and crop_size is None:
raise ValueError("Crop size must be specified if do_center_crop is True.")
if do_rescale and rescale_factor is None:
raise ValueError("Rescale factor must be specified if do_rescale is True.")
if do_normalize and (image_mean is None or image_std is None):
raise ValueError("Image mean and std must be specified if do_normalize is True.")
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if do_resize:
images = [self.resize(image=image, size=size, resample=resample) for image in images]
if do_center_crop:
images = [self.center_crop(image=image, size=crop_size) for image in images]
if do_rescale:
images = [self.rescale(image=image, scale=rescale_factor, offset=rescale_offset) for image in images]
if do_normalize:
images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images]
if include_top:
images = [self.normalize(image=image, mean=[0, 0, 0], std=image_std) for image in images]
images = [to_channel_dimension_format(image, data_format) for image in images]
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
This diff is collapsed.
......@@ -2317,6 +2317,30 @@ class EfficientFormerPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
EFFICIENTNET_PRETRAINED_MODEL_ARCHIVE_LIST = None
class EfficientNetForImageClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class EfficientNetModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class EfficientNetPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
......@@ -191,6 +191,13 @@ class EfficientFormerImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class EfficientNetImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class FlavaFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......
......@@ -82,7 +82,6 @@ class ConvNextModelTester:
labels = ids_tensor([self.batch_size], self.num_labels)
config = self.get_config()
return config, pixel_values, labels
def get_config(self):
......
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingSavingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import EfficientNetImageProcessor
class EfficientNetImageProcessorTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=13,
num_channels=3,
image_size=18,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=None,
do_normalize=True,
image_mean=[0.5, 0.5, 0.5],
image_std=[0.5, 0.5, 0.5],
):
size = size if size is not None else {"height": 18, "width": 18}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
def prepare_image_processor_dict(self):
return {
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_normalize": self.do_normalize,
"do_resize": self.do_resize,
"size": self.size,
}
@require_torch
@require_vision
class EfficientNetImageProcessorTest(ImageProcessingSavingTestMixin, unittest.TestCase):
image_processing_class = EfficientNetImageProcessor if is_vision_available() else None
def setUp(self):
self.image_processor_tester = EfficientNetImageProcessorTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"height": 18, "width": 18})
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
def test_call_pil(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False)
for image in image_inputs:
self.assertIsInstance(image, Image.Image)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
self.assertEqual(
encoded_images.shape,
(
1,
self.image_processor_tester.num_channels,
self.image_processor_tester.size["height"],
self.image_processor_tester.size["width"],
),
)
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_images.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_channels,
self.image_processor_tester.size["height"],
self.image_processor_tester.size["width"],
),
)
def test_call_numpy(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
for image in image_inputs:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
self.assertEqual(
encoded_images.shape,
(
1,
self.image_processor_tester.num_channels,
self.image_processor_tester.size["height"],
self.image_processor_tester.size["width"],
),
)
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_images.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_channels,
self.image_processor_tester.size["height"],
self.image_processor_tester.size["width"],
),
)
def test_call_pytorch(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True)
for image in image_inputs:
self.assertIsInstance(image, torch.Tensor)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
self.assertEqual(
encoded_images.shape,
(
1,
self.image_processor_tester.num_channels,
self.image_processor_tester.size["height"],
self.image_processor_tester.size["width"],
),
)
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_images.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_channels,
self.image_processor_tester.size["height"],
self.image_processor_tester.size["width"],
),
)
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch EfficientNet model. """
import inspect
import unittest
from transformers import EfficientNetConfig
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property, is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
if is_torch_available():
import torch
from transformers import EfficientNetForImageClassification, EfficientNetModel
from transformers.models.efficientnet.modeling_efficientnet import EFFICIENTNET_PRETRAINED_MODEL_ARCHIVE_LIST
if is_vision_available():
from PIL import Image
from transformers import AutoImageProcessor
class EfficientNetModelTester:
def __init__(
self,
parent,
batch_size=13,
image_size=32,
num_channels=3,
kernel_sizes=[3, 3, 5],
in_channels=[32, 16, 24],
out_channels=[16, 24, 40],
strides=[1, 1, 2],
num_block_repeats=[1, 1, 2],
expand_ratios=[1, 6, 6],
is_training=True,
use_labels=True,
intermediate_size=37,
hidden_act="gelu",
num_labels=10,
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.num_channels = num_channels
self.kernel_sizes = kernel_sizes
self.in_channels = in_channels
self.out_channels = out_channels
self.strides = strides
self.num_block_repeats = num_block_repeats
self.expand_ratios = expand_ratios
self.is_training = is_training
self.hidden_act = hidden_act
self.num_labels = num_labels
self.use_labels = use_labels
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
labels = None
if self.use_labels:
labels = ids_tensor([self.batch_size], self.num_labels)
config = self.get_config()
return config, pixel_values, labels
def get_config(self):
return EfficientNetConfig(
num_channels=self.num_channels,
kernel_sizes=self.kernel_sizes,
in_channels=self.in_channels,
out_channels=self.out_channels,
strides=self.strides,
num_block_repeats=self.num_block_repeats,
expand_ratios=self.expand_ratios,
hidden_act=self.hidden_act,
num_labels=self.num_labels,
)
def create_and_check_model(self, config, pixel_values, labels):
model = EfficientNetModel(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
# expected last hidden states: B, C, H // 4, W // 4
self.parent.assertEqual(
result.last_hidden_state.shape,
(self.batch_size, config.hidden_dim, self.image_size // 4, self.image_size // 4),
)
def create_and_check_for_image_classification(self, config, pixel_values, labels):
model = EfficientNetForImageClassification(config)
model.to(torch_device)
model.eval()
result = model(pixel_values, labels=labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values, labels = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_torch
class EfficientNetModelTest(ModelTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as EfficientNet does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (EfficientNetModel, EfficientNetForImageClassification) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
has_attentions = False
def setUp(self):
self.model_tester = EfficientNetModelTester(self)
self.config_tester = ConfigTester(
self, config_class=EfficientNetConfig, has_text_modality=False, hidden_size=37
)
def test_config(self):
self.create_and_test_config_common_properties()
self.config_tester.create_and_test_config_to_json_string()
self.config_tester.create_and_test_config_to_json_file()
self.config_tester.create_and_test_config_from_and_save_pretrained()
self.config_tester.create_and_test_config_with_num_labels()
self.config_tester.check_config_can_be_init_without_params()
self.config_tester.check_config_arguments_init()
def create_and_test_config_common_properties(self):
return
@unittest.skip(reason="EfficientNet does not use inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="EfficientNet does not support input and output embeddings")
def test_model_common_attributes(self):
pass
@unittest.skip(reason="EfficientNet does not use feedforward chunking")
def test_feed_forward_chunking(self):
pass
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
num_blocks = sum(config.num_block_repeats) * 4
self.assertEqual(len(hidden_states), num_blocks)
# EfficientNet's feature maps are of shape (batch_size, num_channels, height, width)
self.assertListEqual(
list(hidden_states[0].shape[-2:]),
[self.model_tester.image_size // 2, self.model_tester.image_size // 2],
)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
check_hidden_states_output(inputs_dict, config, model_class)
def test_for_image_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
for model_name in EFFICIENTNET_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = EfficientNetModel.from_pretrained(model_name)
self.assertIsNotNone(model)
# We will verify our results on an image of cute cats
def prepare_img():
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
return image
@require_torch
@require_vision
class EfficientNetModelIntegrationTest(unittest.TestCase):
@cached_property
def default_image_processor(self):
return AutoImageProcessor.from_pretrained("google/efficientnet-b7") if is_vision_available() else None
@slow
def test_inference_image_classification_head(self):
model = EfficientNetForImageClassification.from_pretrained("google/efficientnet-b7").to(torch_device)
image_processor = self.default_image_processor
image = prepare_img()
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size((1, 1000))
self.assertEqual(outputs.logits.shape, expected_shape)
expected_slice = torch.tensor([0.0001, 0.0002, 0.0002]).to(torch_device)
self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment