"tests/models/codegen/test_tokenization_codegen.py" did not exist on "c824d15aa1590ddb5d2fc977a8a1009a4b1d7262"
Unverified Commit 5f789a68 authored by NielsRogge, committed by GitHub

Add LayoutXLMProcessor (and LayoutXLMTokenizer, LayoutXLMTokenizerFast) (#14115)



* Add LayoutXLMTokenizer and LayoutXLMTokenizerFast

* Fix styling issues

* Fix more styling issues

* Fix more styling issues

* Fix docstring

* Fix unit tests

* Fix docs

* Fix unit tests

* Fix typos and styling issues

* Fix styling issues

* Fix docstring

* Make all tests of test_tokenization_layoutxlm pass

* Add LayoutXLMProcessor

* Make fixup

* Make all LayoutXLMProcessor tests pass

* Minor fixes

* Leave LayoutLMv2Processor tests unchanged

* Fix code quality

* Move LayoutXLM tokenizers and processor to separate folder

* Fix code quality

* Apply suggestions from code review

* Replace assertions by value errors

* Remove methods from fast tokenizer

Co-authored-by: King Yiu Suen <kingyiusuen@gmail.com>
parent 558f8543
...@@ -40,17 +40,45 @@ One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like
model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')
-Note that LayoutXLM requires a different tokenizer, based on :class:`~transformers.XLMRobertaTokenizer`. You can
-initialize it as follows:
+Note that LayoutXLM has its own tokenizer, based on
+:class:`~transformers.LayoutXLMTokenizer`/:class:`~transformers.LayoutXLMTokenizerFast`. You can initialize it as
+follows:
.. code-block::
-    from transformers import AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained('microsoft/layoutxlm-base')
+    from transformers import LayoutXLMTokenizer
+    tokenizer = LayoutXLMTokenizer.from_pretrained('microsoft/layoutxlm-base')
Similar to LayoutLMv2, you can use :class:`~transformers.LayoutXLMProcessor` (which internally applies
:class:`~transformers.LayoutLMv2FeatureExtractor` and
:class:`~transformers.LayoutXLMTokenizer`/:class:`~transformers.LayoutXLMTokenizerFast` in sequence) to prepare all
data for the model.
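For instance, preparing a document image looks roughly like this (a minimal sketch: ``document.png`` is a hypothetical scanned document, and the checkpoint is assumed to ship both a feature extractor config and tokenizer files):

.. code-block::

    from PIL import Image
    from transformers import LayoutXLMProcessor

    processor = LayoutXLMProcessor.from_pretrained('microsoft/layoutxlm-base')

    image = Image.open('document.png').convert('RGB')
    # OCR is applied by default; the recognized words and boxes are then tokenized
    encoding = processor(image, return_tensors='pt')
    # encoding holds input_ids, attention_mask, bbox and image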
As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to :doc:`LayoutLMv2's documentation page
<layoutlmv2>` for all tips, code examples and notebooks.
This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
<https://github.com/microsoft/unilm>`__.
LayoutXLMTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LayoutXLMTokenizer
:members: __call__, build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
LayoutXLMTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LayoutXLMTokenizerFast
:members: __call__
LayoutXLMProcessor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LayoutXLMProcessor
:members: __call__
...@@ -229,6 +229,7 @@ _import_structure = {
"LayoutLMv2Processor",
"LayoutLMv2Tokenizer",
],
"models.layoutxlm": ["LayoutXLMProcessor"],
"models.led": ["LED_PRETRAINED_CONFIG_ARCHIVE_MAP", "LEDConfig", "LEDTokenizer"],
"models.longformer": ["LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "LongformerConfig", "LongformerTokenizer"],
"models.luke": ["LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP", "LukeConfig", "LukeTokenizer"],
...@@ -365,6 +366,7 @@ if is_sentencepiece_available():
_import_structure["models.big_bird"].append("BigBirdTokenizer")
_import_structure["models.camembert"].append("CamembertTokenizer")
_import_structure["models.deberta_v2"].append("DebertaV2Tokenizer")
_import_structure["models.layoutxlm"].append("LayoutXLMTokenizer")
_import_structure["models.m2m_100"].append("M2M100Tokenizer")
_import_structure["models.marian"].append("MarianTokenizer")
_import_structure["models.mbart"].append("MBartTokenizer")
...@@ -411,6 +413,7 @@ if is_tokenizers_available():
_import_structure["models.herbert"].append("HerbertTokenizerFast")
_import_structure["models.layoutlm"].append("LayoutLMTokenizerFast")
_import_structure["models.layoutlmv2"].append("LayoutLMv2TokenizerFast")
_import_structure["models.layoutxlm"].append("LayoutXLMTokenizerFast")
_import_structure["models.led"].append("LEDTokenizerFast")
_import_structure["models.longformer"].append("LongformerTokenizerFast")
_import_structure["models.lxmert"].append("LxmertTokenizerFast")
...@@ -477,6 +480,7 @@ if is_vision_available():
_import_structure["models.detr"].append("DetrFeatureExtractor")
_import_structure["models.layoutlmv2"].append("LayoutLMv2FeatureExtractor")
_import_structure["models.layoutlmv2"].append("LayoutLMv2Processor")
_import_structure["models.layoutxlm"].append("LayoutXLMProcessor")
_import_structure["models.segformer"].append("SegformerFeatureExtractor")
_import_structure["models.vit"].append("ViTFeatureExtractor")
else:
...@@ -2140,6 +2144,7 @@ if TYPE_CHECKING:
LayoutLMv2Processor,
LayoutLMv2Tokenizer,
)
from .models.layoutxlm import LayoutXLMProcessor
from .models.led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig, LEDTokenizer
from .models.longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig, LongformerTokenizer
from .models.luke import LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP, LukeConfig, LukeTokenizer
...@@ -2266,6 +2271,7 @@ if TYPE_CHECKING:
from .models.big_bird import BigBirdTokenizer
from .models.camembert import CamembertTokenizer
from .models.deberta_v2 import DebertaV2Tokenizer
from .models.layoutxlm import LayoutXLMTokenizer
from .models.m2m_100 import M2M100Tokenizer
from .models.marian import MarianTokenizer
from .models.mbart import MBart50Tokenizer, MBartTokenizer
...@@ -2302,6 +2308,7 @@ if TYPE_CHECKING:
from .models.herbert import HerbertTokenizerFast
from .models.layoutlm import LayoutLMTokenizerFast
from .models.layoutlmv2 import LayoutLMv2TokenizerFast
from .models.layoutxlm import LayoutXLMTokenizerFast
from .models.led import LEDTokenizerFast
from .models.longformer import LongformerTokenizerFast
from .models.lxmert import LxmertTokenizerFast
...@@ -2349,6 +2356,7 @@ if TYPE_CHECKING:
from .models.deit import DeiTFeatureExtractor
from .models.detr import DetrFeatureExtractor
from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2Processor
from .models.layoutxlm import LayoutXLMProcessor
from .models.segformer import SegformerFeatureExtractor
from .models.vit import ViTFeatureExtractor
else:
...
...@@ -944,6 +944,7 @@ SLOW_TO_FAST_CONVERTERS = {
"HerbertTokenizer": HerbertConverter,
"LayoutLMTokenizer": BertConverter,
"LayoutLMv2Tokenizer": BertConverter,
"LayoutXLMTokenizer": XLMRobertaConverter,
"LongformerTokenizer": RobertaConverter,
"LEDTokenizer": RobertaConverter,
"LxmertTokenizer": BertConverter,
...
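Mapping "LayoutXLMTokenizer" to XLMRobertaConverter is what lets a fast tokenizer be built from the slow tokenizer's sentencepiece files. A hedged sketch of triggering that conversion explicitly (from_slow=True forces it even when a prebuilt tokenizer.json exists):

    from transformers import LayoutXLMTokenizerFast

    # converts the slow sentencepiece vocabulary through XLMRobertaConverter
    tokenizer = LayoutXLMTokenizerFast.from_pretrained('microsoft/layoutxlm-base', from_slow=True)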
...@@ -59,6 +59,7 @@ from . import (
ibert,
layoutlm,
layoutlmv2,
layoutxlm,
led,
longformer,
luke,
...
...@@ -124,6 +124,7 @@ else:
("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
(
"dpr",
(
...
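With the "layoutxlm" entry registered, AutoTokenizer can resolve checkpoints whose config reports that model type. A minimal sketch (the checkpoint path is hypothetical; the fast class is picked when the tokenizers backend is installed, the slow one otherwise):

    from transformers import AutoTokenizer

    # -> LayoutXLMTokenizerFast if tokenizers is available, else LayoutXLMTokenizer
    tokenizer = AutoTokenizer.from_pretrained('path/to/a-layoutxlm-checkpoint')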
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...file_utils import (
_LazyModule,
is_sentencepiece_available,
is_tokenizers_available,
is_torch_available,
is_vision_available,
)
_import_structure = {}
if is_sentencepiece_available():
_import_structure["tokenization_layoutxlm"] = ["LayoutXLMTokenizer"]
if is_tokenizers_available():
_import_structure["tokenization_layoutxlm_fast"] = ["LayoutXLMTokenizerFast"]
if is_vision_available():
_import_structure["processing_layoutxlm"] = ["LayoutXLMProcessor"]
if TYPE_CHECKING:
if is_sentencepiece_available():
from .tokenization_layoutxlm import LayoutXLMTokenizer
if is_tokenizers_available():
from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast
if is_vision_available():
from .processing_layoutxlm import LayoutXLMProcessor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
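The guarded _import_structure above means each symbol only becomes importable when its backend is installed; otherwise the dummy objects added further below are exported instead. A rough sketch of checking this from user code (is_sentencepiece_available is the same helper the module itself uses):

    from transformers.file_utils import is_sentencepiece_available

    if is_sentencepiece_available():
        from transformers import LayoutXLMTokenizer  # the real, sentencepiece-backed class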
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for LayoutXLM.
"""
from typing import List, Optional, Union
from transformers.models.layoutlmv2.feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor
from ...file_utils import TensorType
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from .tokenization_layoutxlm import LayoutXLMTokenizer
from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast
class LayoutXLMProcessor:
r"""
Constructs a LayoutXLM processor which combines a LayoutXLM feature extractor and a LayoutXLM tokenizer into a
single processor.
:class:`~transformers.LayoutXLMProcessor` offers all the functionalities you need to prepare data for the model.
It first uses :class:`~transformers.LayoutLMv2FeatureExtractor` to resize document images to a fixed size, and
optionally applies OCR to get words and normalized bounding boxes. These are then provided to
:class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`, which turns the words
and bounding boxes into token-level :obj:`input_ids`, :obj:`attention_mask`, :obj:`token_type_ids` and :obj:`bbox`.
Optionally, one can provide integer :obj:`word_labels`, which are turned into token-level :obj:`labels` for token
classification tasks (such as FUNSD, CORD).
Args:
feature_extractor (:obj:`LayoutLMv2FeatureExtractor`):
An instance of :class:`~transformers.LayoutLMv2FeatureExtractor`. The feature extractor is a required
input.
tokenizer (:obj:`LayoutXLMTokenizer` or :obj:`LayoutXLMTokenizerFast`):
An instance of :class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`.
The tokenizer is a required input.
"""
def __init__(self, feature_extractor, tokenizer):
if not isinstance(feature_extractor, LayoutLMv2FeatureExtractor):
raise ValueError(
f"`feature_extractor` has to be of type {LayoutLMv2FeatureExtractor.__name__}, but is {type(feature_extractor)}"
)
if not isinstance(tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast)):
raise ValueError(
f"`tokenizer` has to be of type {LayoutXLMTokenizer.__name__} or {LayoutXLMTokenizerFast.__name__}, but is {type(tokenizer)}"
)
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
def save_pretrained(self, save_directory):
"""
Save a LayoutXLM feature_extractor object and LayoutXLM tokenizer object to the directory ``save_directory``,
so that it can be re-loaded using the :func:`~transformers.LayoutXLMProcessor.from_pretrained` class method.
.. note::
This class method is simply calling
:meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` and
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
docstrings of the methods above for more information.
Args:
save_directory (:obj:`str` or :obj:`os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
self.feature_extractor.save_pretrained(save_directory)
self.tokenizer.save_pretrained(save_directory)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
r"""
Instantiate a :class:`~transformers.LayoutXLMProcessor` from a pretrained LayoutXLM processor.
.. note::
This class method is simply calling LayoutLMv2FeatureExtractor's
:meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained` and
LayoutXLMTokenizer's/LayoutXLMTokenizerFast's :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`.
Please refer to the docstrings of the methods above for more information.
Args:
pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
This can be either:
- a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a feature extractor file saved using the
:meth:`~transformers.SequenceFeatureExtractor.save_pretrained` method, e.g.,
``./my_model_directory/``.
- a path or url to a saved feature extractor JSON `file`, e.g.,
``./my_model_directory/preprocessor_config.json``.
use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to instantiate a fast tokenizer.
**kwargs
Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
:class:`~transformers.PreTrainedTokenizer`
"""
feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
if use_fast:
tokenizer = LayoutXLMTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
else:
tokenizer = LayoutXLMTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
def __call__(
self,
images,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs
) -> BatchEncoding:
"""
This method first forwards the :obj:`images` argument to
:meth:`~transformers.LayoutLMv2FeatureExtractor.__call__`. In case
:class:`~transformers.LayoutLMv2FeatureExtractor` was initialized with :obj:`apply_ocr` set to ``True``, it passes
the obtained words and bounding boxes along with the additional arguments to
:meth:`~transformers.LayoutXLMTokenizer.__call__` and returns the output, together with resized :obj:`images`. In
case :class:`~transformers.LayoutLMv2FeatureExtractor` was initialized with :obj:`apply_ocr`
set to ``False``, it passes the words (:obj:`text`/:obj:`text_pair`) and :obj:`boxes` specified by the user
along with the additional arguments to :meth:`~transformers.LayoutXLMTokenizer.__call__` and returns the
output, together with resized :obj:`images`.
Please refer to the docstring of the above two methods for more information.
"""
# verify input
if self.feature_extractor.apply_ocr and (boxes is not None):
raise ValueError(
"You cannot provide bounding boxes "
"if you initialized the feature extractor with apply_ocr set to True."
)
if self.feature_extractor.apply_ocr and (word_labels is not None):
raise ValueError(
"You cannot provide word labels "
"if you initialized the feature extractor with apply_ocr set to True."
)
# first, apply the feature extractor
features = self.feature_extractor(images=images, return_tensors=return_tensors)
# second, apply the tokenizer
if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
if isinstance(text, str):
text = [text] # add batch dimension (as the feature extractor always adds a batch dimension)
text_pair = features["words"]
encoded_inputs = self.tokenizer(
text=text if text is not None else features["words"],
text_pair=text_pair if text_pair is not None else None,
boxes=boxes if boxes is not None else features["boxes"],
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
# add pixel values
encoded_inputs["image"] = features.pop("pixel_values")
return encoded_inputs
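For the second branch of __call__, where the caller supplies the words and boxes, usage looks roughly like this (a hedged sketch: the words and the 0-1000-normalized boxes are made-up values, and the feature extractor must be created with apply_ocr=False):

    from PIL import Image
    from transformers import LayoutLMv2FeatureExtractor, LayoutXLMProcessor, LayoutXLMTokenizer

    feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
    tokenizer = LayoutXLMTokenizer.from_pretrained('microsoft/layoutxlm-base')
    processor = LayoutXLMProcessor(feature_extractor, tokenizer)

    image = Image.open('document.png').convert('RGB')  # hypothetical scanned document
    words = ['hello', 'world']
    boxes = [[1, 8, 12, 20], [14, 8, 30, 20]]  # one box per word, 0-1000 scale
    encoding = processor(image, text=words, boxes=boxes, return_tensors='pt')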
This diff is collapsed.
...@@ -65,6 +65,15 @@ class DebertaV2Tokenizer:
requires_backends(cls, ["sentencepiece"])
class LayoutXLMTokenizer:
def __init__(self, *args, **kwargs):
requires_backends(self, ["sentencepiece"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["sentencepiece"])
class M2M100Tokenizer:
def __init__(self, *args, **kwargs):
requires_backends(self, ["sentencepiece"])
...
...@@ -200,6 +200,15 @@ class LayoutLMv2TokenizerFast:
requires_backends(cls, ["tokenizers"])
class LayoutXLMTokenizerFast:
def __init__(self, *args, **kwargs):
requires_backends(self, ["tokenizers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["tokenizers"])
class LEDTokenizerFast:
def __init__(self, *args, **kwargs):
requires_backends(self, ["tokenizers"])
...
...@@ -50,6 +50,15 @@ class LayoutLMv2Processor:
requires_backends(cls, ["vision"])
class LayoutXLMProcessor:
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["vision"])
class SegformerFeatureExtractor:
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
...
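These dummy classes keep the top-level imports working in environments without the optional backends; any actual use then fails fast with an installation hint. A behavior sketch (not the exact error message):

    from transformers import LayoutXLMProcessor  # the dummy class if vision is missing

    processor = LayoutXLMProcessor()  # raises, asking to install the vision backend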
This diff is collapsed.
This diff is collapsed.