Unverified Commit 31ee80d5 authored by NielsRogge's avatar NielsRogge Committed by GitHub
Browse files

Add LayoutLMv3 (#17060)



* Make forward pass work

* More improvements

* Remove unused imports

* Remove timm dependency

* Improve loss calculation of token classifier

* Fix most tests

* Add docs

* Add model integration test

* Make all tests pass

* Add LayoutLMv3FeatureExtractor

* Improve integration test + make fixup

* Add example script

* Fix style

* Add LayoutLMv3Processor

* Fix style

* Add option to add visual labels

* Make more tokenizer tests pass

* Fix more tests

* Make more tests pass

* Fix bug and improve docs

* Fix import of processors

* Improve docstrings

* Fix toctree and improve docs

* Fix auto tokenizer

* Move tests to model folder

* Move tests to model folder

* change default behavior add_prefix_space

* add prefix space for fast

* add_prefix_spcae set to True for Fast

* no space before `unique_no_split` token

* add test to hightligh special treatment of added tokens

* fix `test_batch_encode_dynamic_overflowing` by building a long enough example

* fix `test_full_tokenizer` with add_prefix_token

* Fix tokenizer integration test

* Make the code more readable

* Add tests for LayoutLMv3Processor

* Fix style

* Add model to README and update init

* Apply suggestions from code review

* Replace asserts by value errors

* Add suggestion by @ducviet00

* Add model to doc tests

* Simplify script

* Improve README

* a step ahead to fix

* Update pair_input_test

* Make all tokenizer tests pass - phew

* Make style

* Add LayoutLMv3 to CI job

* Fix auto mapping

* Fix CI job name

* Make all processor tests pass

* Make tests of LayoutLMv2 and LayoutXLM consistent

* Add copied from statements to fast tokenizer

* Add copied from statements to slow tokenizer

* Remove add_visual_labels attribute

* Fix tests

* Add link to notebooks

* Improve docs of LayoutLMv3Processor

* Fix reference to section
Co-authored-by: default avatarSaulLu <lucilesaul.com@gmail.com>
Co-authored-by: default avatarNiels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
parent 13541b4a
......@@ -144,3 +144,17 @@ class LayoutLMv2Processor(ProcessorMixin):
)
return images_with_overflow
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
......@@ -499,15 +499,17 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
words = text if text_pair is None else text_pair
assert boxes is not None, "You must provide corresponding bounding boxes"
if boxes is None:
raise ValueError("You must provide corresponding bounding boxes")
if is_batched:
assert len(words) == len(boxes), "You must provide words and boxes for an equal amount of examples"
if len(words) != len(boxes):
raise ValueError("You must provide words and boxes for an equal amount of examples")
for words_example, boxes_example in zip(words, boxes):
assert len(words_example) == len(
boxes_example
), "You must provide as many words as there are bounding boxes"
if len(words_example) != len(boxes_example):
raise ValueError("You must provide as many words as there are bounding boxes")
else:
assert len(words) == len(boxes), "You must provide as many words as there are bounding boxes"
if len(words) != len(boxes):
raise ValueError("You must provide as many words as there are bounding boxes")
if is_batched:
if text_pair is not None and len(text) != len(text_pair):
......
......@@ -260,15 +260,17 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
words = text if text_pair is None else text_pair
assert boxes is not None, "You must provide corresponding bounding boxes"
if boxes is None:
raise ValueError("You must provide corresponding bounding boxes")
if is_batched:
assert len(words) == len(boxes), "You must provide words and boxes for an equal amount of examples"
if len(words) != len(boxes):
raise ValueError("You must provide words and boxes for an equal amount of examples")
for words_example, boxes_example in zip(words, boxes):
assert len(words_example) == len(
boxes_example
), "You must provide as many words as there are bounding boxes"
if len(words_example) != len(boxes_example):
raise ValueError("You must provide as many words as there are bounding boxes")
else:
assert len(words) == len(boxes), "You must provide as many words as there are bounding boxes"
if len(words) != len(boxes):
raise ValueError("You must provide as many words as there are bounding boxes")
if is_batched:
if text_pair is not None and len(text) != len(text_pair):
......
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_tokenizers_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_layoutlmv3": ["LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP", "LayoutLMv3Config"],
"processing_layoutlmv3": ["LayoutLMv3Processor"],
"tokenization_layoutlmv3": ["LayoutLMv3Tokenizer"],
}
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["tokenization_layoutlmv3_fast"] = ["LayoutLMv3TokenizerFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_layoutlmv3"] = [
"LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST",
"LayoutLMv3ForQuestionAnswering",
"LayoutLMv3ForSequenceClassification",
"LayoutLMv3ForTokenClassification",
"LayoutLMv3Model",
"LayoutLMv3PreTrainedModel",
]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_layoutlmv3"] = ["LayoutLMv3FeatureExtractor"]
if TYPE_CHECKING:
from .configuration_layoutlmv3 import LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMv3Config
from .processing_layoutlmv3 import LayoutLMv3Processor
from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .tokenization_layoutlmv3_fast import LayoutLMv3TokenizerFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_layoutlmv3 import (
LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST,
LayoutLMv3ForQuestionAnswering,
LayoutLMv3ForSequenceClassification,
LayoutLMv3ForTokenClassification,
LayoutLMv3Model,
LayoutLMv3PreTrainedModel,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_layoutlmv3 import LayoutLMv3FeatureExtractor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
# coding=utf-8
# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" LayoutLMv3 model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/layoutlmv3-base": "https://huggingface.co/microsoft/layoutlmv3-base/resolve/main/config.json",
}
class LayoutLMv3Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LayoutLMv3Model`]. It is used to instantiate an
LayoutLMv3 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the LayoutLMv3
[microsoft/layoutlmv3-base](https://huggingface.co/microsoft/layoutlmv3-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the LayoutLMv3 model. Defines the number of different tokens that can be represented by
the `inputs_ids` passed when calling [`LayoutLMv3Model`].
hidden_size (`int`, *optional*, defaults to 768):
Dimension of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the `token_type_ids` passed when calling [`LayoutLMv3Model`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum value that the 2D position embedding might ever be used with. Typically set this to something
large just in case (e.g., 1024).
coordinate_size (`int`, *optional*, defaults to `128`):
Dimension of the coordinate embeddings.
shape_size (`int`, *optional*, defaults to `128`):
Dimension of the width and height embeddings.
has_relative_attention_bias (`bool`, *optional*, defaults to `True`):
Whether or not to use a relative attention bias in the self-attention mechanism.
rel_pos_bins (`int`, *optional*, defaults to 32):
The number of relative position bins to be used in the self-attention mechanism.
max_rel_pos (`int`, *optional*, defaults to 128):
The maximum number of relative positions to be used in the self-attention mechanism.
max_rel_2d_pos (`int`, *optional*, defaults to 256):
The maximum number of relative 2D positions in the self-attention mechanism.
rel_2d_pos_bins (`int`, *optional*, defaults to 64):
The number of 2D relative position bins in the self-attention mechanism.
has_spatial_attention_bias (`bool`, *optional*, defaults to `True`):
Whether or not to use a spatial attention bias in the self-attention mechanism.
visual_embed (`bool`, *optional*, defaults to `True`):
Whether or not to add patch embeddings.
input_size (`int`, *optional*, defaults to `224`):
The size (resolution) of the images.
num_channels (`int`, *optional*, defaults to `3`):
The number of channels of the images.
patch_size (`int`, *optional*, defaults to `16`)
The size (resolution) of the patches.
classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
Example:
```python
>>> from transformers import LayoutLMv3Model, LayoutLMv3Config
>>> # Initializing a LayoutLMv3 microsoft/layoutlmv3-base style configuration
>>> configuration = LayoutLMv3Config()
>>> # Initializing a model from the microsoft/layoutlmv3-base style configuration
>>> model = LayoutLMv3Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "layoutlmv3"
def __init__(
self,
vocab_size=50265,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-5,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
max_2d_position_embeddings=1024,
coordinate_size=128,
shape_size=128,
has_relative_attention_bias=True,
rel_pos_bins=32,
max_rel_pos=128,
rel_2d_pos_bins=64,
max_rel_2d_pos=256,
has_spatial_attention_bias=True,
text_embed=True,
visual_embed=True,
input_size=224,
num_channels=3,
patch_size=16,
classifier_dropout=None,
**kwargs
):
super().__init__(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
hidden_act=hidden_act,
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
max_position_embeddings=max_position_embeddings,
type_vocab_size=type_vocab_size,
initializer_range=initializer_range,
layer_norm_eps=layer_norm_eps,
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs,
)
self.max_2d_position_embeddings = max_2d_position_embeddings
self.coordinate_size = coordinate_size
self.shape_size = shape_size
self.has_relative_attention_bias = has_relative_attention_bias
self.rel_pos_bins = rel_pos_bins
self.max_rel_pos = max_rel_pos
self.has_spatial_attention_bias = has_spatial_attention_bias
self.rel_2d_pos_bins = rel_2d_pos_bins
self.max_rel_2d_pos = max_rel_2d_pos
self.text_embed = text_embed
self.visual_embed = visual_embed
self.input_size = input_size
self.num_channels = num_channels
self.patch_size = patch_size
self.classifier_dropout = classifier_dropout
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Feature extractor class for LayoutLMv3.
"""
from typing import List, Optional, Union
import numpy as np
from PIL import Image
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageFeatureExtractionMixin, is_torch_tensor
from ...utils import TensorType, is_pytesseract_available, logging, requires_backends
# soft dependency
if is_pytesseract_available():
import pytesseract
logger = logging.get_logger(__name__)
ImageInput = Union[
Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"] # noqa
]
def normalize_box(box, width, height):
return [
int(1000 * (box[0] / width)),
int(1000 * (box[1] / height)),
int(1000 * (box[2] / width)),
int(1000 * (box[3] / height)),
]
def apply_tesseract(image: Image.Image, lang: Optional[str]):
"""Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
# apply OCR
data = pytesseract.image_to_data(image, lang=lang, output_type="dict")
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
# filter empty words and corresponding coordinates
irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices]
left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices]
top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]
# turn coordinates into (left, top, left+width, top+height) format
actual_boxes = []
for x, y, w, h in zip(left, top, width, height):
actual_box = [x, y, x + w, y + h]
actual_boxes.append(actual_box)
image_width, image_height = image.size
# finally, normalize the bounding boxes
normalized_boxes = []
for box in actual_boxes:
normalized_boxes.append(normalize_box(box, image_width, image_height))
assert len(words) == len(normalized_boxes), "Not as many words as there are bounding boxes"
return words, normalized_boxes
class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
r"""
Constructs a LayoutLMv3 feature extractor. This can be used to resize + normalize document images, as well as to
apply OCR on them in order to get a list of words and normalized bounding boxes.
This feature extractor inherits from [`~feature_extraction_utils.PreTrainedFeatureExtractor`] which contains most
of the main methods. Users should refer to this superclass for more information regarding those methods.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input to a certain `size`.
size (`int` or `Tuple(int)`, *optional*, defaults to 224):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize` is
set to `True`.
resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
`PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. Only has an effect
if `do_resize` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input with mean and standard deviation.
image_mean (`List[int]`, defaults to `[0.5, 0.5, 0.5]`):
The sequence of means for each channel, to be used when normalizing images.
image_std (`List[int]`, defaults to `[0.5, 0.5, 0.5]`):
The sequence of standard deviations for each channel, to be used when normalizing images.
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
ocr_lang (`Optional[str]`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used.
<Tip>
LayoutLMv3FeatureExtractor uses Google's Tesseract OCR engine under the hood.
</Tip>"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize=True,
size=224,
resample=Image.BILINEAR,
do_normalize=True,
image_mean=None,
image_std=None,
apply_ocr=True,
ocr_lang=None,
**kwargs
):
super().__init__(**kwargs)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.apply_ocr = apply_ocr
self.ocr_lang = ocr_lang
def __call__(
self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
) -> BatchFeature:
"""
Main method to prepare for the model one or several image(s).
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
- **words** -- Optional words as identified by Tesseract OCR (only when [`LayoutLMv3FeatureExtractor`] was
initialized with `apply_ocr` set to `True`).
- **boxes** -- Optional bounding boxes as identified by Tesseract OCR, normalized based on the image size
(only when [`LayoutLMv3FeatureExtractor`] was initialized with `apply_ocr` set to `True`).
Examples:
```python
>>> from transformers import LayoutLMv3FeatureExtractor
>>> from PIL import Image
>>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
>>> # option 1: with apply_ocr=True (default)
>>> feature_extractor = LayoutLMv3FeatureExtractor()
>>> encoding = feature_extractor(image, return_tensors="pt")
>>> print(encoding.keys())
>>> # dict_keys(['pixel_values', 'words', 'boxes'])
>>> # option 2: with apply_ocr=False
>>> feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
>>> encoding = feature_extractor(image, return_tensors="pt")
>>> print(encoding.keys())
>>> # dict_keys(['pixel_values'])
```"""
# Input type checking for clearer error
valid_images = False
# Check that images has a valid type
if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images):
valid_images = True
elif isinstance(images, (list, tuple)):
if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]):
valid_images = True
if not valid_images:
raise ValueError(
"Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), "
"`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples), "
f"but is of type {type(images)}."
)
is_batched = bool(
isinstance(images, (list, tuple))
and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]))
)
if not is_batched:
images = [images]
# Tesseract OCR to get words + normalized bounding boxes
if self.apply_ocr:
requires_backends(self, "pytesseract")
words_batch = []
boxes_batch = []
for image in images:
words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang)
words_batch.append(words)
boxes_batch.append(boxes)
# transformations (resizing + normalization)
if self.do_resize and self.size is not None:
images = [self.resize(image=image, size=self.size, resample=self.resample) for image in images]
if self.do_normalize:
images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images]
# return as BatchFeature
data = {"pixel_values": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if self.apply_ocr:
encoded_inputs["words"] = words_batch
encoded_inputs["boxes"] = boxes_batch
return encoded_inputs
This diff is collapsed.
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for LayoutLMv3.
"""
from typing import List, Optional, Union
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class LayoutLMv3Processor(ProcessorMixin):
r"""
Constructs a LayoutLMv3 processor which combines a LayoutLMv3 feature extractor and a LayoutLMv3 tokenizer into a
single processor.
[`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model.
It first uses [`LayoutLMv3FeatureExtractor`] to resize and normalize document images, and optionally applies OCR to
get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or
[`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
into token-level `labels` for token classification tasks (such as FUNSD, CORD).
Args:
feature_extractor (`LayoutLMv3FeatureExtractor`):
An instance of [`LayoutLMv3FeatureExtractor`]. The feature extractor is a required input.
tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`):
An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input.
"""
feature_extractor_class = "LayoutLMv3FeatureExtractor"
tokenizer_class = ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast")
def __call__(
self,
images,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs
) -> BatchEncoding:
"""
This method first forwards the `images` argument to [`~LayoutLMv3FeatureExtractor.__call__`]. In case
[`LayoutLMv3FeatureExtractor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
bounding boxes along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output,
together with resized and normalized `pixel_values`. In case [`LayoutLMv3FeatureExtractor`] was initialized
with `apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user
along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with
resized and normalized `pixel_values`.
Please refer to the docstring of the above two methods for more information.
"""
# verify input
if self.feature_extractor.apply_ocr and (boxes is not None):
raise ValueError(
"You cannot provide bounding boxes "
"if you initialized the feature extractor with apply_ocr set to True."
)
if self.feature_extractor.apply_ocr and (word_labels is not None):
raise ValueError(
"You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
)
# first, apply the feature extractor
features = self.feature_extractor(images=images, return_tensors=return_tensors)
# second, apply the tokenizer
if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
if isinstance(text, str):
text = [text] # add batch dimension (as the feature extractor always adds a batch dimension)
text_pair = features["words"]
encoded_inputs = self.tokenizer(
text=text if text is not None else features["words"],
text_pair=text_pair if text_pair is not None else None,
boxes=boxes if boxes is not None else features["boxes"],
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
# add pixel values
images = features.pop("pixel_values")
if return_overflowing_tokens is True:
images = self.get_overflowing_images(images, encoded_inputs["overflow_to_sample_mapping"])
encoded_inputs["pixel_values"] = images
return encoded_inputs
def get_overflowing_images(self, images, overflow_to_sample_mapping):
# in case there's an overflow, ensure each `input_ids` sample is mapped to its corresponding image
images_with_overflow = []
for sample_idx in overflow_to_sample_mapping:
images_with_overflow.append(images[sample_idx])
if len(images_with_overflow) != len(overflow_to_sample_mapping):
raise ValueError(
"Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
)
return images_with_overflow
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
This diff is collapsed.
......@@ -28,7 +28,9 @@ from ...utils import (
)
_import_structure = {}
_import_structure = {
"processing_layoutxlm": ["LayoutXLMProcessor"],
}
try:
if not is_sentencepiece_available():
......@@ -46,15 +48,9 @@ except OptionalDependencyNotAvailable:
else:
_import_structure["tokenization_layoutxlm_fast"] = ["LayoutXLMTokenizerFast"]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["processing_layoutxlm"] = ["LayoutXLMProcessor"]
if TYPE_CHECKING:
from .processing_layoutxlm import LayoutXLMProcessor
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
......@@ -71,14 +67,6 @@ if TYPE_CHECKING:
else:
from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .processing_layoutlmv2 import LayoutXLMProcessor
else:
import sys
......
......@@ -121,6 +121,37 @@ class LayoutXLMProcessor(ProcessorMixin):
)
# add pixel values
encoded_inputs["image"] = features.pop("pixel_values")
images = features.pop("pixel_values")
if return_overflowing_tokens is True:
images = self.get_overflowing_images(images, encoded_inputs["overflow_to_sample_mapping"])
encoded_inputs["image"] = images
return encoded_inputs
def get_overflowing_images(self, images, overflow_to_sample_mapping):
# in case there's an overflow, ensure each `input_ids` sample is mapped to its corresponding image
images_with_overflow = []
for sample_idx in overflow_to_sample_mapping:
images_with_overflow.append(images[sample_idx])
if len(images_with_overflow) != len(overflow_to_sample_mapping):
raise ValueError(
"Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
)
return images_with_overflow
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
......@@ -2367,6 +2367,44 @@ class LayoutLMv2PreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
LAYOUTLMV3_PRETRAINED_MODEL_ARCHIVE_LIST = None
class LayoutLMv3ForQuestionAnswering(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class LayoutLMv3ForSequenceClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class LayoutLMv3ForTokenClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class LayoutLMv3Model(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class LayoutLMv3PreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
LED_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
......@@ -171,6 +171,13 @@ class LayoutLMv2TokenizerFast(metaclass=DummyObject):
requires_backends(self, ["tokenizers"])
class LayoutLMv3TokenizerFast(metaclass=DummyObject):
_backends = ["tokenizers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tokenizers"])
class LayoutXLMTokenizerFast(metaclass=DummyObject):
_backends = ["tokenizers"]
......
......@@ -94,14 +94,7 @@ class LayoutLMv2FeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class LayoutLMv2Processor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class LayoutXLMProcessor(metaclass=DummyObject):
class LayoutLMv3FeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
......
......@@ -215,10 +215,11 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
)
# verify input_ids
# this was obtained with Tesseract 4.1.1
# fmt: off
expected_decoding = "[CLS] 11 : 14 to 11 : 39 a. m 11 : 39 to 11 : 44 a. m. 11 : 44 a. m. to 12 : 25 p. m. 12 : 25 to 12 : 58 p. m. 12 : 58 to 4 : 00 p. m. 2 : 00 to 5 : 00 p. m. coffee break coffee will be served for men and women in the lobby adjacent to exhibit area. please move into exhibit area. ( exhibits open ) trrf general session ( part | ) presiding : lee a. waller trrf vice president “ introductory remarks ” lee a. waller, trrf vice presi - dent individual interviews with trrf public board members and sci - entific advisory council mem - bers conducted by trrf treasurer philip g. kuehn to get answers which the public refrigerated warehousing industry is looking for. plus questions from the floor. dr. emil m. mrak, university of cal - ifornia, chairman, trrf board ; sam r. cecil, university of georgia college of agriculture ; dr. stanley charm, tufts university school of medicine ; dr. robert h. cotton, itt continental baking company ; dr. owen fennema, university of wis - consin ; dr. robert e. hardenburg, usda. questions and answers exhibits open capt. jack stoney room trrf scientific advisory council meeting ballroom foyer [SEP]" # noqa: E231
# fmt: on
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
......@@ -236,10 +237,11 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
)
# verify input_ids
# this was obtained with Tesseract 4.1.1
# fmt: off
expected_decoding = "[CLS] 7 itc limited report and accounts 2013 itc ’ s brands : an asset for the nation the consumer needs and aspirations they fulfil, the benefit they generate for millions across itc ’ s value chains, the future - ready capabilities that support them, and the value that they create for the country, have made itc ’ s brands national assets, adding to india ’ s competitiveness. it is itc ’ s aspiration to be the no 1 fmcg player in the country, driven by its new fmcg businesses. a recent nielsen report has highlighted that itc's new fmcg businesses are the fastest growing among the top consumer goods companies operating in india. itc takes justifiable pride that, along with generating economic value, these celebrated indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. di wills * ; love delightfully soft skin? aia ans source : https : / / www. industrydocuments. ucsf. edu / docs / snbx0223 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]" # noqa: E231
# fmt: on
decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
decoding = processor.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
@slow
......@@ -266,7 +268,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
# verify input_ids
expected_decoding = "[CLS] hello world [SEP]"
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
......@@ -281,7 +283,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
# verify input_ids
expected_decoding = "[CLS] hello world [SEP] [PAD] [PAD] [PAD]"
decoding = tokenizer.decode(input_processor.input_ids[0].tolist())
decoding = processor.decode(input_processor.input_ids[0].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
......@@ -320,7 +322,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
# verify input_ids
expected_decoding = "[CLS] weirdly world [SEP]"
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify labels
......@@ -342,7 +344,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
# verify input_ids
expected_decoding = "[CLS] my name is niels [SEP]"
decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
decoding = processor.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
......@@ -382,10 +384,11 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
# this was obtained with Tesseract 4.1.1
# fmt: off
expected_decoding = "[CLS] what's his name? [SEP] 11 : 14 to 11 : 39 a. m 11 : 39 to 11 : 44 a. m. 11 : 44 a. m. to 12 : 25 p. m. 12 : 25 to 12 : 58 p. m. 12 : 58 to 4 : 00 p. m. 2 : 00 to 5 : 00 p. m. coffee break coffee will be served for men and women in the lobby adjacent to exhibit area. please move into exhibit area. ( exhibits open ) trrf general session ( part | ) presiding : lee a. waller trrf vice president “ introductory remarks ” lee a. waller, trrf vice presi - dent individual interviews with trrf public board members and sci - entific advisory council mem - bers conducted by trrf treasurer philip g. kuehn to get answers which the public refrigerated warehousing industry is looking for. plus questions from the floor. dr. emil m. mrak, university of cal - ifornia, chairman, trrf board ; sam r. cecil, university of georgia college of agriculture ; dr. stanley charm, tufts university school of medicine ; dr. robert h. cotton, itt continental baking company ; dr. owen fennema, university of wis - consin ; dr. robert e. hardenburg, usda. questions and answers exhibits open capt. jack stoney room trrf scientific advisory council meeting ballroom foyer [SEP]" # noqa: E231
# fmt: on
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
......@@ -400,8 +403,9 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
# this was obtained with Tesseract 4.1.1
expected_decoding = "[CLS] what's the time [SEP] 7 itc limited report and accounts 2013 itc ’ s [SEP]"
decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
decoding = processor.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
......@@ -434,7 +438,7 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
# verify input_ids
expected_decoding = "[CLS] what's his name? [SEP] hello world [SEP]"
decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
......@@ -450,11 +454,11 @@ class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
# verify input_ids
expected_decoding = "[CLS] how old is he? [SEP] hello world [SEP] [PAD] [PAD] [PAD]"
decoding = tokenizer.decode(input_processor.input_ids[0].tolist())
decoding = processor.decode(input_processor.input_ids[0].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
expected_decoding = "[CLS] what's the time [SEP] my name is niels [SEP]"
decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
decoding = processor.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
......
# coding=utf-8
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers.testing_utils import require_pytesseract, require_torch
from transformers.utils import is_pytesseract_available, is_torch_available
from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_pytesseract_available():
from PIL import Image
from transformers import LayoutLMv3FeatureExtractor
class LayoutLMv3FeatureExtractionTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
image_size=18,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=18,
apply_ocr=True,
):
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.apply_ocr = apply_ocr
def prepare_feat_extract_dict(self):
return {"do_resize": self.do_resize, "size": self.size, "apply_ocr": self.apply_ocr}
@require_torch
@require_pytesseract
class LayoutLMv3FeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase):
feature_extraction_class = LayoutLMv3FeatureExtractor if is_pytesseract_available() else None
def setUp(self):
self.feature_extract_tester = LayoutLMv3FeatureExtractionTester(self)
@property
def feat_extract_dict(self):
return self.feature_extract_tester.prepare_feat_extract_dict()
def test_feat_extract_properties(self):
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
self.assertTrue(hasattr(feature_extractor, "do_resize"))
self.assertTrue(hasattr(feature_extractor, "size"))
self.assertTrue(hasattr(feature_extractor, "apply_ocr"))
def test_batch_feature(self):
pass
def test_call_pil(self):
# Initialize feature_extractor
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
# create random PIL images
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False)
for image in image_inputs:
self.assertIsInstance(image, Image.Image)
# Test not batched input
encoding = feature_extractor(image_inputs[0], return_tensors="pt")
self.assertEqual(
encoding.pixel_values.shape,
(
1,
self.feature_extract_tester.num_channels,
self.feature_extract_tester.size,
self.feature_extract_tester.size,
),
)
self.assertIsInstance(encoding.words, list)
self.assertIsInstance(encoding.boxes, list)
# Test batched
encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_images.shape,
(
self.feature_extract_tester.batch_size,
self.feature_extract_tester.num_channels,
self.feature_extract_tester.size,
self.feature_extract_tester.size,
),
)
def test_call_numpy(self):
# Initialize feature_extractor
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
# create random numpy tensors
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True)
for image in image_inputs:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values
self.assertEqual(
encoded_images.shape,
(
1,
self.feature_extract_tester.num_channels,
self.feature_extract_tester.size,
self.feature_extract_tester.size,
),
)
# Test batched
encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_images.shape,
(
self.feature_extract_tester.batch_size,
self.feature_extract_tester.num_channels,
self.feature_extract_tester.size,
self.feature_extract_tester.size,
),
)
def test_call_pytorch(self):
# Initialize feature_extractor
feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
# create random PyTorch tensors
image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
for image in image_inputs:
self.assertIsInstance(image, torch.Tensor)
# Test not batched input
encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values
self.assertEqual(
encoded_images.shape,
(
1,
self.feature_extract_tester.num_channels,
self.feature_extract_tester.size,
self.feature_extract_tester.size,
),
)
# Test batched
encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_images.shape,
(
self.feature_extract_tester.batch_size,
self.feature_extract_tester.num_channels,
self.feature_extract_tester.size,
self.feature_extract_tester.size,
),
)
def test_LayoutLMv3_integration_test(self):
# with apply_OCR = True
feature_extractor = LayoutLMv3FeatureExtractor()
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
image = Image.open(ds[0]["file"]).convert("RGB")
encoding = feature_extractor(image, return_tensors="pt")
self.assertEqual(encoding.pixel_values.shape, (1, 3, 224, 224))
self.assertEqual(len(encoding.words), len(encoding.boxes))
# fmt: off
# the words and boxes were obtained with Tesseract 4.1.1
expected_words = [['11:14', 'to', '11:39', 'a.m', '11:39', 'to', '11:44', 'a.m.', '11:44', 'a.m.', 'to', '12:25', 'p.m.', '12:25', 'to', '12:58', 'p.m.', '12:58', 'to', '4:00', 'p.m.', '2:00', 'to', '5:00', 'p.m.', 'Coffee', 'Break', 'Coffee', 'will', 'be', 'served', 'for', 'men', 'and', 'women', 'in', 'the', 'lobby', 'adjacent', 'to', 'exhibit', 'area.', 'Please', 'move', 'into', 'exhibit', 'area.', '(Exhibits', 'Open)', 'TRRF', 'GENERAL', 'SESSION', '(PART', '|)', 'Presiding:', 'Lee', 'A.', 'Waller', 'TRRF', 'Vice', 'President', '“Introductory', 'Remarks”', 'Lee', 'A.', 'Waller,', 'TRRF', 'Vice', 'Presi-', 'dent', 'Individual', 'Interviews', 'with', 'TRRF', 'Public', 'Board', 'Members', 'and', 'Sci-', 'entific', 'Advisory', 'Council', 'Mem-', 'bers', 'Conducted', 'by', 'TRRF', 'Treasurer', 'Philip', 'G.', 'Kuehn', 'to', 'get', 'answers', 'which', 'the', 'public', 'refrigerated', 'warehousing', 'industry', 'is', 'looking', 'for.', 'Plus', 'questions', 'from', 'the', 'floor.', 'Dr.', 'Emil', 'M.', 'Mrak,', 'University', 'of', 'Cal-', 'ifornia,', 'Chairman,', 'TRRF', 'Board;', 'Sam', 'R.', 'Cecil,', 'University', 'of', 'Georgia', 'College', 'of', 'Agriculture;', 'Dr.', 'Stanley', 'Charm,', 'Tufts', 'University', 'School', 'of', 'Medicine;', 'Dr.', 'Robert', 'H.', 'Cotton,', 'ITT', 'Continental', 'Baking', 'Company;', 'Dr.', 'Owen', 'Fennema,', 'University', 'of', 'Wis-', 'consin;', 'Dr.', 'Robert', 'E.', 'Hardenburg,', 'USDA.', 'Questions', 'and', 'Answers', 'Exhibits', 'Open', 'Capt.', 'Jack', 'Stoney', 'Room', 'TRRF', 'Scientific', 'Advisory', 'Council', 'Meeting', 'Ballroom', 'Foyer']] # noqa: E231
expected_boxes = [[[141, 57, 214, 69], [228, 58, 252, 69], [141, 75, 216, 88], [230, 79, 280, 88], [142, 260, 218, 273], [230, 261, 255, 273], [143, 279, 218, 290], [231, 282, 290, 291], [143, 342, 218, 354], [231, 345, 289, 355], [202, 362, 227, 373], [143, 379, 220, 392], [231, 382, 291, 394], [144, 714, 220, 726], [231, 715, 256, 726], [144, 732, 220, 745], [232, 736, 291, 747], [144, 769, 218, 782], [231, 770, 256, 782], [141, 788, 202, 801], [215, 791, 274, 804], [143, 826, 204, 838], [215, 826, 240, 838], [142, 844, 202, 857], [215, 847, 274, 859], [334, 57, 427, 69], [440, 57, 522, 69], [369, 75, 461, 88], [469, 75, 516, 88], [528, 76, 562, 88], [570, 76, 667, 88], [675, 75, 711, 87], [721, 79, 778, 88], [789, 75, 840, 88], [369, 97, 470, 107], [484, 94, 507, 106], [518, 94, 562, 107], [576, 94, 655, 110], [668, 94, 792, 109], [804, 95, 829, 107], [369, 113, 465, 125], [477, 116, 547, 125], [562, 113, 658, 125], [671, 116, 748, 125], [761, 113, 811, 125], [369, 131, 465, 143], [477, 133, 548, 143], [563, 130, 698, 145], [710, 130, 802, 146], [336, 171, 412, 183], [423, 171, 572, 183], [582, 170, 716, 184], [728, 171, 817, 187], [829, 171, 844, 186], [338, 197, 482, 212], [507, 196, 557, 209], [569, 196, 595, 208], [610, 196, 702, 209], [505, 214, 583, 226], [595, 214, 656, 227], [670, 215, 807, 227], [335, 259, 543, 274], [556, 259, 708, 272], [372, 279, 422, 291], [435, 279, 460, 291], [474, 279, 574, 292], [587, 278, 664, 291], [676, 278, 738, 291], [751, 279, 834, 291], [372, 298, 434, 310], [335, 341, 483, 354], [497, 341, 655, 354], [667, 341, 728, 354], [740, 341, 825, 354], [335, 360, 430, 372], [442, 360, 534, 372], [545, 359, 687, 372], [697, 360, 754, 372], [765, 360, 823, 373], [334, 378, 428, 391], [440, 378, 577, 394], [590, 378, 705, 391], [720, 378, 801, 391], [334, 397, 400, 409], [370, 416, 529, 429], [544, 416, 576, 432], [587, 416, 665, 428], [677, 416, 814, 429], [372, 435, 452, 450], [465, 434, 495, 447], [511, 434, 600, 447], [611, 436, 637, 447], [649, 436, 694, 451], [705, 438, 824, 447], [369, 453, 452, 466], [464, 454, 509, 466], [522, 453, 611, 469], [625, 453, 792, 469], [370, 472, 556, 488], [570, 472, 684, 487], [697, 472, 718, 485], [732, 472, 835, 488], [369, 490, 411, 503], [425, 490, 484, 503], [496, 490, 635, 506], [645, 490, 707, 503], [718, 491, 761, 503], [771, 490, 840, 503], [336, 510, 374, 521], [388, 510, 447, 522], [460, 510, 489, 521], [503, 510, 580, 522], [592, 509, 736, 525], [745, 509, 770, 522], [781, 509, 840, 522], [338, 528, 434, 541], [448, 528, 596, 541], [609, 527, 687, 540], [700, 528, 792, 541], [336, 546, 397, 559], [407, 546, 431, 559], [443, 546, 525, 560], [537, 546, 680, 562], [688, 546, 714, 559], [722, 546, 837, 562], [336, 565, 449, 581], [461, 565, 485, 577], [497, 565, 665, 581], [681, 565, 718, 577], [732, 565, 837, 580], [337, 584, 438, 597], [452, 583, 521, 596], [535, 584, 677, 599], [690, 583, 787, 596], [801, 583, 825, 596], [338, 602, 478, 615], [492, 602, 530, 614], [543, 602, 638, 615], [650, 602, 676, 614], [688, 602, 788, 615], [802, 602, 843, 614], [337, 621, 502, 633], [516, 621, 615, 637], [629, 621, 774, 636], [789, 621, 827, 633], [337, 639, 418, 652], [432, 640, 571, 653], [587, 639, 731, 655], [743, 639, 769, 652], [780, 639, 841, 652], [338, 658, 440, 673], [455, 658, 491, 670], [508, 658, 602, 671], [616, 658, 638, 670], [654, 658, 835, 674], [337, 677, 429, 689], [337, 714, 482, 726], [495, 714, 548, 726], [561, 714, 683, 726], [338, 770, 461, 782], [474, 769, 554, 785], [489, 788, 562, 803], [576, 788, 643, 801], [656, 787, 751, 804], [764, 788, 844, 801], [334, 825, 421, 838], [430, 824, 574, 838], [584, 824, 723, 841], [335, 844, 450, 857], [464, 843, 583, 860], [628, 862, 755, 875], [769, 861, 848, 878]]] # noqa: E231
# fmt: on
self.assertListEqual(encoding.words, expected_words)
self.assertListEqual(encoding.boxes, expected_boxes)
# with apply_OCR = False
feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
encoding = feature_extractor(image, return_tensors="pt")
self.assertEqual(encoding.pixel_values.shape, (1, 3, 224, 224))
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment