Unverified Commit b6ddb08a, authored by NielsRogge and committed by GitHub

Add LayoutLMv2 + LayoutXLM (#12604)



* First commit

* Make style

* Fix dummy objects

* Add Detectron2 config

* Add LayoutLMv2 pooler

* More improvements, add documentation

* More improvements

* Add model tests

* Add clarification regarding image input

* Improve integration test

* Fix bug

* Fix another bug

* Fix another bug

* Fix another bug

* More improvements

* Make more tests pass

* Make more tests pass

* Improve integration test

* Remove gradient checkpointing and add head masking

* Add integration test

* Add LayoutLMv2ForSequenceClassification to the tests

* Add LayoutLMv2ForQuestionAnswering

* More improvements

* More improvements

* Small improvements

* Fix _LazyModule

* Fix fast tokenizer

* Move sync_batch_norm to a separate method

* Replace dummies by requires_backends

* Move calculation of visual bounding boxes to separate method + update README

* Add models to main init

* First draft

* More improvements

* More improvements

* More improvements

* More improvements

* More improvements

* Remove is_split_into_words

* More improvements

* Simplify tesseract - no use of pandas anymore

* Add LayoutLMv2Processor

* Update is_pytesseract_available

* Fix bugs

* Improve feature extractor

* Fix bug

* Add print statement

* Add truncation of bounding boxes

* Add tests for LayoutLMv2FeatureExtractor and LayoutLMv2Tokenizer

* Improve tokenizer tests

* Make more tokenizer tests pass

* Make more tests pass, add integration tests

* Finish integration tests

* More improvements

* More improvements - update API of the tokenizer

* More improvements

* Remove support for VQA training

* Remove some files

* Improve feature extractor

* Improve documentation and one more tokenizer test

* Make quality and small docs improvements

* Add batched tests for LayoutLMv2Processor, remove fast tokenizer

* Add truncation of labels

* Apply suggestions from code review

* Improve processor tests

* Fix failing tests and add suggestion from code review

* Fix tokenizer test

* Add detectron2 CI job

* Simplify CI job

* Comment out non-detectron2 jobs and specify number of processes

* Add pip install torchvision

* Add durations to see which tests are slow

* Fix tokenizer test and make model tests smaller

* First draft

* Use setattr

* Possible fix

* Proposal with configuration

* First draft of fast tokenizer

* More improvements

* Enable fast tokenizer tests

* Make more tests pass

* Make more tests pass

* More improvements

* Add padding to fast tokenizer

* Make more tests pass

* Make more tests pass

* Make all tests pass for fast tokenizer

* Make fast tokenizer support overflowing boxes and labels

* Add support for overflowing_labels to slow tokenizer

* Add support for fast tokenizer to the processor

* Update processor tests for both slow and fast tokenizers

* Add head models to model mappings

* Make style & quality

* Remove Detectron2 config file

* Add configurable option to label all subwords

* Fix test

* Skip visual segment embeddings in test

* Use ResNet-18 backbone in tests instead of ResNet-101

* Proposal

* Re-enable all jobs on CI

* Fix installation of tesseract

* Fix failing test

* Fix index table

* Add LayoutXLM doc page, first draft of code examples

* Improve documentation a lot

* Update expected boxes for Tesseract 4.0.0 beta

* Use offsets to create labels instead of checking if they start with ##

* Update expected boxes for Tesseract 4.1.1

* Fix conflict

* Make variable names cleaner, add docstring, add link to notebooks

* Revert "Fix conflict"

This reverts commit a9b46ce9afe47ebfcfe7b45e6a121d49e74ef2c5.

* Revert to make integration test pass

* Apply suggestions from @LysandreJik's review

* Address @patrickvonplaten's comments

* Remove fixtures DocVQA in favor of dataset on the hub
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
parent 439e7abd
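At the center of this PR is LayoutLMv2Processor, which chains the feature extractor (image resizing plus optional Tesseract OCR) with a slow or fast tokenizer; the tests below exercise it case by case. A minimal usage sketch, not part of the diff, assuming the microsoft/layoutlmv2-base-uncased checkpoint and a hypothetical local document.png:

from PIL import Image

from transformers import LayoutLMv2Processor

processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")

image = Image.open("document.png").convert("RGB")  # hypothetical input document
encoding = processor(image, return_tensors="pt")  # apply_ocr=True by default

# the processor returns everything the model consumes:
print(sorted(encoding.keys()))  # ['attention_mask', 'bbox', 'image', 'input_ids', 'token_type_ids']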
# This file is autogenerated by the command `make fix-copies`, do not edit.
from ..file_utils import requires_backends


LAYOUTLM_V2_PRETRAINED_MODEL_ARCHIVE_LIST = None


class LayoutLMv2Model:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["detectron2"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["detectron2"])
@@ -2004,6 +2004,54 @@ class LayoutLMPreTrainedModel:
        requires_backends(cls, ["torch"])


LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST = None


class LayoutLMv2ForQuestionAnswering:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch"])


class LayoutLMv2ForSequenceClassification:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch"])


class LayoutLMv2ForTokenClassification:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch"])


class LayoutLMv2Model:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch"])


class LayoutLMv2PreTrainedModel:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch"])


LED_PRETRAINED_MODEL_ARCHIVE_LIST = None
@@ -164,6 +164,15 @@ class LayoutLMTokenizerFast:
        requires_backends(cls, ["tokenizers"])


class LayoutLMv2TokenizerFast:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tokenizers"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["tokenizers"])


class LEDTokenizerFast:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tokenizers"])
@@ -36,6 +36,20 @@ class DetrFeatureExtractor:
        requires_backends(self, ["vision"])


class LayoutLMv2FeatureExtractor:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["vision"])


class LayoutLMv2Processor:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["vision"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["vision"])


class ViTFeatureExtractor:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["vision"])
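The dummies above keep every LayoutLMv2 name importable even when a backend is missing; requires_backends only fires on use. A sketch of the resulting behaviour, assuming detectron2 is not installed:

# import always succeeds, instantiation fails with an actionable message
from transformers import LayoutLMv2Model

try:
    model = LayoutLMv2Model()
except ImportError as err:
    print(err)  # names detectron2 as the missing backend and points at install instructions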
# coding=utf-8
# Copyright 2021 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import numpy as np

from transformers.file_utils import is_pytesseract_available, is_torch_available
from transformers.testing_utils import require_pytesseract, require_torch

from .test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs


if is_torch_available():
    import torch

if is_pytesseract_available():
    from PIL import Image

    from transformers import LayoutLMv2FeatureExtractor


class LayoutLMv2FeatureExtractionTester(unittest.TestCase):
    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        image_size=18,
        min_resolution=30,
        max_resolution=400,
        do_resize=True,
        size=18,
        apply_ocr=True,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.size = size
        self.apply_ocr = apply_ocr

    def prepare_feat_extract_dict(self):
        return {"do_resize": self.do_resize, "size": self.size, "apply_ocr": self.apply_ocr}


@require_torch
@require_pytesseract
class LayoutLMv2FeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase):

    feature_extraction_class = LayoutLMv2FeatureExtractor if is_pytesseract_available() else None

    def setUp(self):
        self.feature_extract_tester = LayoutLMv2FeatureExtractionTester(self)

    @property
    def feat_extract_dict(self):
        return self.feature_extract_tester.prepare_feat_extract_dict()

    def test_feat_extract_properties(self):
        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
        self.assertTrue(hasattr(feature_extractor, "do_resize"))
        self.assertTrue(hasattr(feature_extractor, "size"))
        self.assertTrue(hasattr(feature_extractor, "apply_ocr"))

    def test_batch_feature(self):
        pass

    def test_call_pil(self):
        # Initialize feature_extractor
        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
        # create random PIL images
        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False)
        for image in image_inputs:
            self.assertIsInstance(image, Image.Image)

        # Test not batched input
        encoding = feature_extractor(image_inputs[0], return_tensors="pt")
        self.assertEqual(
            encoding.pixel_values.shape,
            (
                1,
                self.feature_extract_tester.num_channels,
                self.feature_extract_tester.size,
                self.feature_extract_tester.size,
            ),
        )

        self.assertIsInstance(encoding.words, list)
        self.assertIsInstance(encoding.boxes, list)

        # Test batched
        encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values
        self.assertEqual(
            encoded_images.shape,
            (
                self.feature_extract_tester.batch_size,
                self.feature_extract_tester.num_channels,
                self.feature_extract_tester.size,
                self.feature_extract_tester.size,
            ),
        )

    def test_call_numpy(self):
        # Initialize feature_extractor
        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
        # create random numpy tensors
        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, numpify=True)
        for image in image_inputs:
            self.assertIsInstance(image, np.ndarray)

        # Test not batched input
        encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values
        self.assertEqual(
            encoded_images.shape,
            (
                1,
                self.feature_extract_tester.num_channels,
                self.feature_extract_tester.size,
                self.feature_extract_tester.size,
            ),
        )

        # Test batched
        encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values
        self.assertEqual(
            encoded_images.shape,
            (
                self.feature_extract_tester.batch_size,
                self.feature_extract_tester.num_channels,
                self.feature_extract_tester.size,
                self.feature_extract_tester.size,
            ),
        )

    def test_call_pytorch(self):
        # Initialize feature_extractor
        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
        # create random PyTorch tensors
        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True)
        for image in image_inputs:
            self.assertIsInstance(image, torch.Tensor)

        # Test not batched input
        encoded_images = feature_extractor(image_inputs[0], return_tensors="pt").pixel_values
        self.assertEqual(
            encoded_images.shape,
            (
                1,
                self.feature_extract_tester.num_channels,
                self.feature_extract_tester.size,
                self.feature_extract_tester.size,
            ),
        )

        # Test batched
        encoded_images = feature_extractor(image_inputs, return_tensors="pt").pixel_values
        self.assertEqual(
            encoded_images.shape,
            (
                self.feature_extract_tester.batch_size,
                self.feature_extract_tester.num_channels,
                self.feature_extract_tester.size,
                self.feature_extract_tester.size,
            ),
        )

    def test_layoutlmv2_integration_test(self):
        # with apply_OCR = True
        feature_extractor = LayoutLMv2FeatureExtractor()

        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")

        image = Image.open(ds[0]["file"]).convert("RGB")

        encoding = feature_extractor(image, return_tensors="pt")

        self.assertEqual(encoding.pixel_values.shape, (1, 3, 224, 224))
        self.assertEqual(len(encoding.words), len(encoding.boxes))

        # fmt: off
        # the words and boxes were obtained with Tesseract 4.1.1
        expected_words = [['11:14', 'to', '11:39', 'a.m', '11:39', 'to', '11:44', 'a.m.', '11:44', 'a.m.', 'to', '12:25', 'p.m.', '12:25', 'to', '12:58', 'p.m.', '12:58', 'to', '4:00', 'p.m.', '2:00', 'to', '5:00', 'p.m.', 'Coffee', 'Break', 'Coffee', 'will', 'be', 'served', 'for', 'men', 'and', 'women', 'in', 'the', 'lobby', 'adjacent', 'to', 'exhibit', 'area.', 'Please', 'move', 'into', 'exhibit', 'area.', '(Exhibits', 'Open)', 'TRRF', 'GENERAL', 'SESSION', '(PART', '|)', 'Presiding:', 'Lee', 'A.', 'Waller', 'TRRF', 'Vice', 'President', '“Introductory', 'Remarks”', 'Lee', 'A.', 'Waller,', 'TRRF', 'Vice', 'Presi-', 'dent', 'Individual', 'Interviews', 'with', 'TRRF', 'Public', 'Board', 'Members', 'and', 'Sci-', 'entific', 'Advisory', 'Council', 'Mem-', 'bers', 'Conducted', 'by', 'TRRF', 'Treasurer', 'Philip', 'G.', 'Kuehn', 'to', 'get', 'answers', 'which', 'the', 'public', 'refrigerated', 'warehousing', 'industry', 'is', 'looking', 'for.', 'Plus', 'questions', 'from', 'the', 'floor.', 'Dr.', 'Emil', 'M.', 'Mrak,', 'University', 'of', 'Cal-', 'ifornia,', 'Chairman,', 'TRRF', 'Board;', 'Sam', 'R.', 'Cecil,', 'University', 'of', 'Georgia', 'College', 'of', 'Agriculture;', 'Dr.', 'Stanley', 'Charm,', 'Tufts', 'University', 'School', 'of', 'Medicine;', 'Dr.', 'Robert', 'H.', 'Cotton,', 'ITT', 'Continental', 'Baking', 'Company;', 'Dr.', 'Owen', 'Fennema,', 'University', 'of', 'Wis-', 'consin;', 'Dr.', 'Robert', 'E.', 'Hardenburg,', 'USDA.', 'Questions', 'and', 'Answers', 'Exhibits', 'Open', 'Capt.', 'Jack', 'Stoney', 'Room', 'TRRF', 'Scientific', 'Advisory', 'Council', 'Meeting', 'Ballroom', 'Foyer']] # noqa: E231
        expected_boxes = [[[141, 57, 214, 69], [228, 58, 252, 69], [141, 75, 216, 88], [230, 79, 280, 88], [142, 260, 218, 273], [230, 261, 255, 273], [143, 279, 218, 290], [231, 282, 290, 291], [143, 342, 218, 354], [231, 345, 289, 355], [202, 362, 227, 373], [143, 379, 220, 392], [231, 382, 291, 394], [144, 714, 220, 726], [231, 715, 256, 726], [144, 732, 220, 745], [232, 736, 291, 747], [144, 769, 218, 782], [231, 770, 256, 782], [141, 788, 202, 801], [215, 791, 274, 804], [143, 826, 204, 838], [215, 826, 240, 838], [142, 844, 202, 857], [215, 847, 274, 859], [334, 57, 427, 69], [440, 57, 522, 69], [369, 75, 461, 88], [469, 75, 516, 88], [528, 76, 562, 88], [570, 76, 667, 88], [675, 75, 711, 87], [721, 79, 778, 88], [789, 75, 840, 88], [369, 97, 470, 107], [484, 94, 507, 106], [518, 94, 562, 107], [576, 94, 655, 110], [668, 94, 792, 109], [804, 95, 829, 107], [369, 113, 465, 125], [477, 116, 547, 125], [562, 113, 658, 125], [671, 116, 748, 125], [761, 113, 811, 125], [369, 131, 465, 143], [477, 133, 548, 143], [563, 130, 698, 145], [710, 130, 802, 146], [336, 171, 412, 183], [423, 171, 572, 183], [582, 170, 716, 184], [728, 171, 817, 187], [829, 171, 844, 186], [338, 197, 482, 212], [507, 196, 557, 209], [569, 196, 595, 208], [610, 196, 702, 209], [505, 214, 583, 226], [595, 214, 656, 227], [670, 215, 807, 227], [335, 259, 543, 274], [556, 259, 708, 272], [372, 279, 422, 291], [435, 279, 460, 291], [474, 279, 574, 292], [587, 278, 664, 291], [676, 278, 738, 291], [751, 279, 834, 291], [372, 298, 434, 310], [335, 341, 483, 354], [497, 341, 655, 354], [667, 341, 728, 354], [740, 341, 825, 354], [335, 360, 430, 372], [442, 360, 534, 372], [545, 359, 687, 372], [697, 360, 754, 372], [765, 360, 823, 373], [334, 378, 428, 391], [440, 378, 577, 394], [590, 378, 705, 391], [720, 378, 801, 391], [334, 397, 400, 409], [370, 416, 529, 429], [544, 416, 576, 432], [587, 416, 665, 428], [677, 416, 814, 429], [372, 435, 452, 450], [465, 434, 495, 447], [511, 434, 600, 447], [611, 436, 637, 447], [649, 436, 694, 451], [705, 438, 824, 447], [369, 453, 452, 466], [464, 454, 509, 466], [522, 453, 611, 469], [625, 453, 792, 469], [370, 472, 556, 488], [570, 472, 684, 487], [697, 472, 718, 485], [732, 472, 835, 488], [369, 490, 411, 503], [425, 490, 484, 503], [496, 490, 635, 506], [645, 490, 707, 503], [718, 491, 761, 503], [771, 490, 840, 503], [336, 510, 374, 521], [388, 510, 447, 522], [460, 510, 489, 521], [503, 510, 580, 522], [592, 509, 736, 525], [745, 509, 770, 522], [781, 509, 840, 522], [338, 528, 434, 541], [448, 528, 596, 541], [609, 527, 687, 540], [700, 528, 792, 541], [336, 546, 397, 559], [407, 546, 431, 559], [443, 546, 525, 560], [537, 546, 680, 562], [688, 546, 714, 559], [722, 546, 837, 562], [336, 565, 449, 581], [461, 565, 485, 577], [497, 565, 665, 581], [681, 565, 718, 577], [732, 565, 837, 580], [337, 584, 438, 597], [452, 583, 521, 596], [535, 584, 677, 599], [690, 583, 787, 596], [801, 583, 825, 596], [338, 602, 478, 615], [492, 602, 530, 614], [543, 602, 638, 615], [650, 602, 676, 614], [688, 602, 788, 615], [802, 602, 843, 614], [337, 621, 502, 633], [516, 621, 615, 637], [629, 621, 774, 636], [789, 621, 827, 633], [337, 639, 418, 652], [432, 640, 571, 653], [587, 639, 731, 655], [743, 639, 769, 652], [780, 639, 841, 652], [338, 658, 440, 673], [455, 658, 491, 670], [508, 658, 602, 671], [616, 658, 638, 670], [654, 658, 835, 674], [337, 677, 429, 689], [337, 714, 482, 726], [495, 714, 548, 726], [561, 714, 683, 726], [338, 770, 461, 782], [474, 769, 554, 785], [489, 788, 562, 803], [576, 788, 643, 801], [656, 787, 751, 804], [764, 788, 844, 801], [334, 825, 421, 838], [430, 824, 574, 838], [584, 824, 723, 841], [335, 844, 450, 857], [464, 843, 583, 860], [628, 862, 755, 875], [769, 861, 848, 878]]] # noqa: E231
        # fmt: on

        self.assertListEqual(encoding.words, expected_words)
        self.assertListEqual(encoding.boxes, expected_boxes)

        # with apply_OCR = False
        feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)

        encoding = feature_extractor(image, return_tensors="pt")

        self.assertEqual(
            encoding.pixel_values.shape,
            (
                1,
                3,
                224,
                224,
            ),
        )
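A note on the integration test above: with apply_ocr=True the feature extractor returns, besides the resized pixel_values, the Tesseract words and their bounding boxes, the latter normalized to a 0-1000 scale as LayoutLM expects. A minimal sketch, assuming a hypothetical local document.png:

from PIL import Image

from transformers import LayoutLMv2FeatureExtractor

feature_extractor = LayoutLMv2FeatureExtractor()  # apply_ocr=True by default

image = Image.open("document.png").convert("RGB")  # hypothetical input document
encoding = feature_extractor(image, return_tensors="pt")

print(encoding.pixel_values.shape)  # torch.Size([1, 3, 224, 224])
print(encoding.words[0][:3])  # first OCR'd words of the first image
print(encoding.boxes[0][:3])  # matching boxes, normalized to 0-1000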
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch LayoutLMv2 model. """
import os
import random
import tempfile
import unittest

from transformers.file_utils import is_detectron2_available, is_torch_available
from transformers.testing_utils import require_detectron2, require_torch, slow, torch_device

from .test_configuration_common import ConfigTester
from .test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask


if is_torch_available():
    import torch

    from transformers import (
        MODEL_MAPPING,
        LayoutLMv2Config,
        LayoutLMv2ForQuestionAnswering,
        LayoutLMv2ForSequenceClassification,
        LayoutLMv2ForTokenClassification,
        LayoutLMv2Model,
    )
    from transformers.models.layoutlmv2.modeling_layoutlmv2 import LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST

if is_detectron2_available():
    from detectron2.structures.image_list import ImageList


class LayoutLMv2ModelTester:
    def __init__(
        self,
        parent,
        batch_size=2,
        num_channels=3,
        image_size=4,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        use_token_type_ids=True,
        use_labels=True,
        vocab_size=99,
        hidden_size=36,
        num_hidden_layers=3,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=16,
        type_sequence_label_size=2,
        initializer_range=0.02,
        image_feature_pool_shape=[7, 7, 256],
        coordinate_size=6,
        shape_size=6,
        num_labels=3,
        num_choices=4,
        scope=None,
        range_bbox=1000,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_input_mask = use_input_mask
        self.use_token_type_ids = use_token_type_ids
        self.use_labels = use_labels
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.type_sequence_label_size = type_sequence_label_size
        self.initializer_range = initializer_range
        self.image_feature_pool_shape = image_feature_pool_shape
        self.coordinate_size = coordinate_size
        self.shape_size = shape_size
        self.num_labels = num_labels
        self.num_choices = num_choices
        self.scope = scope
        self.range_bbox = range_bbox

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox)
        # Ensure that bbox is legal
        for i in range(bbox.shape[0]):
            for j in range(bbox.shape[1]):
                if bbox[i, j, 3] < bbox[i, j, 1]:
                    t = bbox[i, j, 3]
                    bbox[i, j, 3] = bbox[i, j, 1]
                    bbox[i, j, 1] = t
                if bbox[i, j, 2] < bbox[i, j, 0]:
                    t = bbox[i, j, 2]
                    bbox[i, j, 2] = bbox[i, j, 0]
                    bbox[i, j, 0] = t

        image = ImageList(
            torch.zeros(self.batch_size, self.num_channels, self.image_size, self.image_size), self.image_size
        )

        input_mask = None
        if self.use_input_mask:
            input_mask = random_attention_mask([self.batch_size, self.seq_length])

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        sequence_labels = None
        token_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)

        config = LayoutLMv2Config(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            is_decoder=False,
            initializer_range=self.initializer_range,
            image_feature_pool_shape=self.image_feature_pool_shape,
            coordinate_size=self.coordinate_size,
            shape_size=self.shape_size,
        )

        # use smaller resnet backbone to make tests faster
        config.detectron2_config_args["MODEL.RESNETS.DEPTH"] = 18
        config.detectron2_config_args["MODEL.RESNETS.RES2_OUT_CHANNELS"] = 64
        config.detectron2_config_args["MODEL.RESNETS.NUM_GROUPS"] = 1

        return config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels

    def create_and_check_model(
        self, config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels
    ):
        model = LayoutLMv2Model(config=config)
        model.to(torch_device)
        model.eval()
        result = model(input_ids, bbox=bbox, image=image, attention_mask=input_mask, token_type_ids=token_type_ids)
        result = model(input_ids, bbox=bbox, image=image, token_type_ids=token_type_ids)
        result = model(input_ids, bbox=bbox, image=image)

        # LayoutLMv2 has a different expected sequence length, namely also visual tokens are added
        expected_seq_len = self.seq_length + self.image_feature_pool_shape[0] * self.image_feature_pool_shape[1]

        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, self.hidden_size))
        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))

    def create_and_check_for_sequence_classification(
        self, config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels
    ):
        config.num_labels = self.num_labels
        model = LayoutLMv2ForSequenceClassification(config)
        model.to(torch_device)
        model.eval()
        result = model(
            input_ids,
            bbox=bbox,
            image=image,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            labels=sequence_labels,
        )
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))

    def create_and_check_for_token_classification(
        self, config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels
    ):
        config.num_labels = self.num_labels
        model = LayoutLMv2ForTokenClassification(config=config)
        model.to(torch_device)
        model.eval()
        result = model(
            input_ids,
            bbox=bbox,
            image=image,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            labels=token_labels,
        )
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))

    def create_and_check_for_question_answering(
        self, config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels
    ):
        model = LayoutLMv2ForQuestionAnswering(config=config)
        model.to(torch_device)
        model.eval()
        result = model(
            input_ids,
            bbox=bbox,
            image=image,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            start_positions=sequence_labels,
            end_positions=sequence_labels,
        )
        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (
            config,
            input_ids,
            bbox,
            image,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
        ) = config_and_inputs
        inputs_dict = {
            "input_ids": input_ids,
            "bbox": bbox,
            "image": image,
            "token_type_ids": token_type_ids,
            "attention_mask": input_mask,
        }
        return config, inputs_dict


@require_torch
@require_detectron2
class LayoutLMv2ModelTest(ModelTesterMixin, unittest.TestCase):

    test_pruning = False
    test_torchscript = False

    all_model_classes = (
        (
            LayoutLMv2Model,
            LayoutLMv2ForSequenceClassification,
            LayoutLMv2ForTokenClassification,
            LayoutLMv2ForQuestionAnswering,
        )
        if is_torch_available()
        else ()
    )

    def setUp(self):
        self.model_tester = LayoutLMv2ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=LayoutLMv2Config, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_model_various_embeddings(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        for type in ["absolute", "relative_key", "relative_key_query"]:
            config_and_inputs[0].position_embedding_type = type
            self.model_tester.create_and_check_model(*config_and_inputs)

    def test_for_sequence_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)

    def test_for_token_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)

    def test_for_question_answering(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)

    def test_save_load_fast_init_from_base(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        base_class = MODEL_MAPPING[config.__class__]

        if isinstance(base_class, tuple):
            base_class = base_class[0]

        for model_class in self.all_model_classes:
            if model_class == base_class:
                continue

            # make a copy of model class to not break future tests
            # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class
            class CopyClass(model_class):
                pass

            model_class_copy = CopyClass

            # make sure that all keys are expected for test
            model_class_copy._keys_to_ignore_on_load_missing = []

            # make init deterministic, but make sure that
            # non-initialized weights throw errors nevertheless
            model_class_copy._init_weights = self._mock_init_weights

            model = base_class(config)
            state_dict = model.state_dict()

            # this will often delete a single weight of a multi-weight module
            # to test an edge case
            random_key_to_del = random.choice(list(state_dict.keys()))
            del state_dict[random_key_to_del]

            # check that certain keys didn't get saved with the model
            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)
                torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin"))

                model_fast_init = model_class_copy.from_pretrained(tmpdirname)
                model_slow_init = model_class_copy.from_pretrained(tmpdirname, _fast_init=False)

                for key in model_fast_init.state_dict().keys():
                    if key == "layoutlmv2.visual_segment_embedding":
                        # we skip the visual segment embedding as it has a custom initialization scheme
                        continue
                    max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item()
                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")

    def test_attention_outputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True

        # LayoutLMv2 has a different expected sequence length
        expected_seq_len = (
            self.model_tester.seq_length
            + self.model_tester.image_feature_pool_shape[0] * self.model_tester.image_feature_pool_shape[1]
        )

        for model_class in self.all_model_classes:
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
            config.return_dict = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.attentions
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

            # check that output_attentions also work using config
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.attentions
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

            self.assertListEqual(
                list(attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, expected_seq_len, expected_seq_len],
            )
            out_len = len(outputs)

            # Check attention is always last and order is fine
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            if hasattr(self.model_tester, "num_hidden_states_types"):
                added_hidden_states = self.model_tester.num_hidden_states_types
            else:
                added_hidden_states = 1
            self.assertEqual(out_len + added_hidden_states, len(outputs))

            self_attentions = outputs.attentions

            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
            self.assertListEqual(
                list(self_attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, expected_seq_len, expected_seq_len],
            )

    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            hidden_states = outputs.hidden_states

            expected_num_layers = getattr(
                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
            )
            self.assertEqual(len(hidden_states), expected_num_layers)

            # LayoutLMv2 has a different expected sequence length
            expected_seq_len = (
                self.model_tester.seq_length
                + self.model_tester.image_feature_pool_shape[0] * self.model_tester.image_feature_pool_shape[1]
            )

            self.assertListEqual(
                list(hidden_states[0].shape[-2:]),
                [expected_seq_len, self.model_tester.hidden_size],
            )

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            check_hidden_states_output(inputs_dict, config, model_class)

    @slow
    def test_model_from_pretrained(self):
        for model_name in LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = LayoutLMv2Model.from_pretrained(model_name)
            self.assertIsNotNone(model)

    def test_initialization(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        configs_no_init = _config_zero_init(config)
        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            for name, param in model.named_parameters():
                if "backbone" in name or "visual_segment_embedding" in name:
                    continue
                if param.requires_grad:
                    self.assertIn(
                        ((param.data.mean() * 1e9).round() / 1e9).item(),
                        [0.0, 1.0],
                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                    )


def prepare_layoutlmv2_batch_inputs():
    # Here we prepare a batch of 2 sequences to test a LayoutLMv2 forward pass on:
    # fmt: off
    input_ids = torch.tensor([[101,1019,1014,1016,1037,12849,4747,1004,14246,2278,5439,4524,5002,2930,2193,2930,4341,3208,1005,1055,2171,2848,11300,3531,102],[101,4070,4034,7020,1024,3058,1015,1013,2861,1013,6070,19274,2772,6205,27814,16147,16147,4343,2047,10283,10969,14389,1012,2338,102]],device=torch_device) # noqa: E231
    bbox = torch.tensor([[[0,0,0,0],[423,237,440,251],[427,272,441,287],[419,115,437,129],[961,885,992,912],[256,38,330,58],[256,38,330,58],[336,42,353,57],[360,39,401,56],[360,39,401,56],[411,39,471,59],[479,41,528,59],[533,39,630,60],[67,113,134,131],[141,115,209,132],[68,149,133,166],[141,149,187,164],[195,148,287,165],[195,148,287,165],[195,148,287,165],[295,148,349,165],[441,149,492,166],[497,149,546,164],[64,201,125,218],[1000,1000,1000,1000]],[[0,0,0,0],[662,150,754,166],[665,199,742,211],[519,213,554,228],[519,213,554,228],[134,433,187,454],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[314,469,376,482],[504,684,582,706],[941,825,973,900],[941,825,973,900],[941,825,973,900],[941,825,973,900],[610,749,652,765],[130,659,168,672],[176,657,237,672],[238,657,312,672],[443,653,628,672],[443,653,628,672],[716,301,825,317],[1000,1000,1000,1000]]],device=torch_device) # noqa: E231
    image = ImageList(torch.randn((2,3,224,224)), image_sizes=[(224,224), (224,224)]) # noqa: E231
    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],],device=torch_device) # noqa: E231
    token_type_ids = torch.tensor([[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]],device=torch_device) # noqa: E231
    # fmt: on

    return input_ids, bbox, image, attention_mask, token_type_ids


@require_torch
@require_detectron2
class LayoutLMv2ModelIntegrationTest(unittest.TestCase):
    @slow
    def test_inference_no_head(self):
        model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased").to(torch_device)

        (
            input_ids,
            bbox,
            image,
            attention_mask,
            token_type_ids,
        ) = prepare_layoutlmv2_batch_inputs()

        # forward pass
        outputs = model(
            input_ids=input_ids,
            bbox=bbox,
            image=image,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # verify the sequence output
        expected_shape = torch.Size(
            (
                2,
                input_ids.shape[1]
                + model.config.image_feature_pool_shape[0] * model.config.image_feature_pool_shape[1],
                model.config.hidden_size,
            )
        )
        self.assertEqual(outputs.last_hidden_state.shape, expected_shape)

        expected_slice = torch.tensor(
            [[-0.1087, 0.0727, -0.3075], [0.0799, -0.0427, -0.0751], [-0.0367, 0.0480, -0.1358]], device=torch_device
        )
        self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-3))

        # verify the pooled output
        expected_shape = torch.Size((2, model.config.hidden_size))
        self.assertEqual(outputs.pooler_output.shape, expected_shape)
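As a worked check of the sequence-length bookkeeping these tests rely on (the numbers come from the batch and config above; a sketch, not part of the diff):

# each sequence in the integration-test batch has 25 text tokens, and the
# visual backbone contributes one token per cell of the pooled feature map
text_seq_len = 25
pool_height, pool_width = 7, 7  # from image_feature_pool_shape = [7, 7, 256]
visual_seq_len = pool_height * pool_width  # 49 visual tokens appended after the text
expected_seq_len = text_seq_len + visual_seq_len  # 74
# hence last_hidden_state has shape (2, 74, hidden_size)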
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import tempfile
import unittest
from typing import List

from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
from transformers.file_utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available
from transformers.models.layoutlmv2 import LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast
from transformers.models.layoutlmv2.tokenization_layoutlmv2 import VOCAB_FILES_NAMES
from transformers.testing_utils import require_pytesseract, require_tokenizers, require_torch, slow


if is_pytesseract_available():
    from PIL import Image

    from transformers import LayoutLMv2FeatureExtractor, LayoutLMv2Processor


@require_pytesseract
@require_tokenizers
class LayoutLMv2ProcessorTest(unittest.TestCase):
    tokenizer_class = LayoutLMv2Tokenizer
    rust_tokenizer_class = LayoutLMv2TokenizerFast

    def setUp(self):
        vocab_tokens = [
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[PAD]",
            "[MASK]",
            "want",
            "##want",
            "##ed",
            "wa",
            "un",
            "runn",
            "##ing",
            ",",
            "low",
            "lowest",
        ]

        feature_extractor_map = {
            "do_resize": True,
            "size": 224,
            "apply_ocr": True,
        }

        self.tmpdirname = tempfile.mkdtemp()
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

        self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
        with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(feature_extractor_map) + "\n")

    def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
        return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]

    def get_feature_extractor(self, **kwargs):
        return LayoutLMv2FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    def test_save_load_pretrained_default(self):
        feature_extractor = self.get_feature_extractor()
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

            processor.save_pretrained(self.tmpdirname)
            processor = LayoutLMv2Processor.from_pretrained(self.tmpdirname)

            self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
            self.assertIsInstance(processor.tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast))

            self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
            self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)

    def test_save_load_pretrained_additional_features(self):
        processor = LayoutLMv2Processor(feature_extractor=self.get_feature_extractor(), tokenizer=self.get_tokenizer())
        processor.save_pretrained(self.tmpdirname)

        # slow tokenizer
        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
        feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)

        processor = LayoutLMv2Processor.from_pretrained(
            self.tmpdirname, use_fast=False, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
        )

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, LayoutLMv2Tokenizer)

        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
        self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)

        # fast tokenizer
        tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
        feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)

        processor = LayoutLMv2Processor.from_pretrained(
            self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
        )

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, LayoutLMv2TokenizerFast)

        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
        self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)


# different use cases tests
@require_torch
@require_pytesseract
class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
    @cached_property
    def get_images(self):
        # we verify our implementation on 2 document images from the DocVQA dataset
        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")

        image_1 = Image.open(ds[0]["file"]).convert("RGB")
        image_2 = Image.open(ds[1]["file"]).convert("RGB")

        return image_1, image_2

    @cached_property
    def get_tokenizers(self):
        slow_tokenizer = LayoutLMv2Tokenizer.from_pretrained("microsoft/layoutlmv2-base-uncased")
        fast_tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
        return [slow_tokenizer, fast_tokenizer]

    @slow
    def test_processor_case_1(self):
        # case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True

        feature_extractor = LayoutLMv2FeatureExtractor()
        tokenizers = self.get_tokenizers
        images = self.get_images

        for tokenizer in tokenizers:
            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

            # not batched
            input_feat_extract = feature_extractor(images[0], return_tensors="pt")
            input_processor = processor(images[0], return_tensors="pt")

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids", "token_type_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify image
            self.assertAlmostEqual(
                input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
            )

            # verify input_ids
            # fmt: off
            expected_decoding = "[CLS] 11 : 14 to 11 : 39 a. m 11 : 39 to 11 : 44 a. m. 11 : 44 a. m. to 12 : 25 p. m. 12 : 25 to 12 : 58 p. m. 12 : 58 to 4 : 00 p. m. 2 : 00 to 5 : 00 p. m. coffee break coffee will be served for men and women in the lobby adjacent to exhibit area. please move into exhibit area. ( exhibits open ) trrf general session ( part | ) presiding : lee a. waller trrf vice president “ introductory remarks ” lee a. waller, trrf vice presi - dent individual interviews with trrf public board members and sci - entific advisory council mem - bers conducted by trrf treasurer philip g. kuehn to get answers which the public refrigerated warehousing industry is looking for. plus questions from the floor. dr. emil m. mrak, university of cal - ifornia, chairman, trrf board ; sam r. cecil, university of georgia college of agriculture ; dr. stanley charm, tufts university school of medicine ; dr. robert h. cotton, itt continental baking company ; dr. owen fennema, university of wis - consin ; dr. robert e. hardenburg, usda. questions and answers exhibits open capt. jack stoney room trrf scientific advisory council meeting ballroom foyer [SEP]" # noqa: E231
            # fmt: on
            decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # batched
            input_feat_extract = feature_extractor(images, return_tensors="pt")
            input_processor = processor(images, padding=True, return_tensors="pt")

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids", "token_type_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify images
            self.assertAlmostEqual(
                input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
            )

            # verify input_ids
            # fmt: off
            expected_decoding = "[CLS] 7 itc limited report and accounts 2013 itc ’ s brands : an asset for the nation the consumer needs and aspirations they fulfil, the benefit they generate for millions across itc ’ s value chains, the future - ready capabilities that support them, and the value that they create for the country, have made itc ’ s brands national assets, adding to india ’ s competitiveness. it is itc ’ s aspiration to be the no 1 fmcg player in the country, driven by its new fmcg businesses. a recent nielsen report has highlighted that itc's new fmcg businesses are the fastest growing among the top consumer goods companies operating in india. itc takes justifiable pride that, along with generating economic value, these celebrated indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. di wills * ; love delightfully soft skin? aia ans source : https : / / www. industrydocuments. ucsf. edu / docs / snbx0223 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]" # noqa: E231
            # fmt: on
            decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

    @slow
    def test_processor_case_2(self):
        # case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False

        feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
        tokenizers = self.get_tokenizers
        images = self.get_images

        for tokenizer in tokenizers:
            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

            # not batched
            words = ["hello", "world"]
            boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
            input_processor = processor(images[0], words, boxes=boxes, return_tensors="pt")

            # verify keys
            expected_keys = ["input_ids", "bbox", "token_type_ids", "attention_mask", "image"]
            actual_keys = list(input_processor.keys())
            for key in expected_keys:
                self.assertIn(key, actual_keys)

            # verify input_ids
            expected_decoding = "[CLS] hello world [SEP]"
            decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # batched
            words = [["hello", "world"], ["my", "name", "is", "niels"]]
            boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
            input_processor = processor(images, words, boxes=boxes, padding=True, return_tensors="pt")

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids", "token_type_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify input_ids
            expected_decoding = "[CLS] hello world [SEP] [PAD] [PAD] [PAD]"
            decoding = tokenizer.decode(input_processor.input_ids[0].tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # verify bbox
            expected_bbox = [
                [0, 0, 0, 0],
                [3, 2, 5, 1],
                [6, 7, 4, 2],
                [3, 9, 2, 4],
                [1, 1, 2, 3],
                [1, 1, 2, 3],
                [1000, 1000, 1000, 1000],
            ]
            self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)

    @slow
    def test_processor_case_3(self):
        # case 3: token classification (training), apply_ocr=False

        feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
        tokenizers = self.get_tokenizers
        images = self.get_images

        for tokenizer in tokenizers:
            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

            # not batched
            words = ["weirdly", "world"]
            boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
            word_labels = [1, 2]
            input_processor = processor(images[0], words, boxes=boxes, word_labels=word_labels, return_tensors="pt")

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels", "token_type_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify input_ids
            expected_decoding = "[CLS] weirdly world [SEP]"
            decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # verify labels
            expected_labels = [-100, 1, -100, 2, -100]
            self.assertListEqual(input_processor.labels.squeeze().tolist(), expected_labels)

            # batched
            words = [["hello", "world"], ["my", "name", "is", "niels"]]
            boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
            word_labels = [[1, 2], [6, 3, 10, 2]]
            input_processor = processor(
                images, words, boxes=boxes, word_labels=word_labels, padding=True, return_tensors="pt"
            )

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels", "token_type_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify input_ids
            expected_decoding = "[CLS] my name is niels [SEP]"
            decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # verify bbox
            expected_bbox = [
                [0, 0, 0, 0],
                [3, 2, 5, 1],
                [6, 7, 4, 2],
                [3, 9, 2, 4],
                [1, 1, 2, 3],
                [1, 1, 2, 3],
                [1000, 1000, 1000, 1000],
            ]
            self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)

            # verify labels
            expected_labels = [-100, 6, 3, 10, 2, -100, -100]
            self.assertListEqual(input_processor.labels[1].tolist(), expected_labels)

    @slow
    def test_processor_case_4(self):
        # case 4: visual question answering (inference), apply_ocr=True

        feature_extractor = LayoutLMv2FeatureExtractor()
        tokenizers = self.get_tokenizers
        images = self.get_images

        for tokenizer in tokenizers:
            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

            # not batched
            question = "What's his name?"
            input_processor = processor(images[0], question, return_tensors="pt")

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids", "token_type_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify input_ids
            # fmt: off
            expected_decoding = "[CLS] what's his name? [SEP] 11 : 14 to 11 : 39 a. m 11 : 39 to 11 : 44 a. m. 11 : 44 a. m. to 12 : 25 p. m. 12 : 25 to 12 : 58 p. m. 12 : 58 to 4 : 00 p. m. 2 : 00 to 5 : 00 p. m. coffee break coffee will be served for men and women in the lobby adjacent to exhibit area. please move into exhibit area. ( exhibits open ) trrf general session ( part | ) presiding : lee a. waller trrf vice president “ introductory remarks ” lee a. waller, trrf vice presi - dent individual interviews with trrf public board members and sci - entific advisory council mem - bers conducted by trrf treasurer philip g. kuehn to get answers which the public refrigerated warehousing industry is looking for. plus questions from the floor. dr. emil m. mrak, university of cal - ifornia, chairman, trrf board ; sam r. cecil, university of georgia college of agriculture ; dr. stanley charm, tufts university school of medicine ; dr. robert h. cotton, itt continental baking company ; dr. owen fennema, university of wis - consin ; dr. robert e. hardenburg, usda. questions and answers exhibits open capt. jack stoney room trrf scientific advisory council meeting ballroom foyer [SEP]" # noqa: E231
            # fmt: on
            decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # batched
            questions = ["How old is he?", "what's the time"]
            input_processor = processor(
                images, questions, padding="max_length", max_length=20, truncation=True, return_tensors="pt"
            )

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids", "token_type_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify input_ids
            expected_decoding = "[CLS] what's the time [SEP] 7 itc limited report and accounts 2013 itc ’ s [SEP]"
            decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # verify bbox
            # fmt: off
            expected_bbox = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [0, 45, 67, 80], [72, 56, 109, 67], [72, 56, 109, 67], [116, 56, 189, 67], [198, 59, 253, 66], [257, 59, 285, 66], [289, 59, 365, 66], [372, 59, 407, 66], [74, 136, 161, 158], [74, 136, 161, 158], [74, 136, 161, 158], [74, 136, 161, 158], [1000, 1000, 1000, 1000]] # noqa: E231
            # fmt: on
            self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)

    @slow
    def test_processor_case_5(self):
        # case 5: visual question answering (inference), apply_ocr=False

        feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
        tokenizers = self.get_tokenizers
        images = self.get_images

        for tokenizer in tokenizers:
            processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

            # not batched
            question = "What's his name?"
            words = ["hello", "world"]
            boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
            input_processor = processor(images[0], question, words, boxes, return_tensors="pt")

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids", "token_type_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify input_ids
            expected_decoding = "[CLS] what's his name? [SEP] hello world [SEP]"
            decoding = tokenizer.decode(input_processor.input_ids.squeeze().tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # batched
            questions = ["How old is he?", "what's the time"]
            words = [["hello", "world"], ["my", "name", "is", "niels"]]
            boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
            input_processor = processor(images, questions, words, boxes, padding=True, return_tensors="pt")

            # verify keys
            expected_keys = ["attention_mask", "bbox", "image", "input_ids", "token_type_ids"]
            actual_keys = sorted(list(input_processor.keys()))
            self.assertListEqual(actual_keys, expected_keys)

            # verify input_ids
            expected_decoding = "[CLS] how old is he? [SEP] hello world [SEP] [PAD] [PAD] [PAD]"
            decoding = tokenizer.decode(input_processor.input_ids[0].tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            expected_decoding = "[CLS] what's the time [SEP] my name is niels [SEP]"
            decoding = tokenizer.decode(input_processor.input_ids[1].tolist())
            self.assertSequenceEqual(decoding, expected_decoding)

            # verify bbox
            expected_bbox = [[6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3], [1, 1, 2, 3], [1000, 1000, 1000, 1000]]
            self.assertListEqual(input_processor.bbox[1].tolist()[-5:], expected_bbox)
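The -100 values asserted in case 3 come from the tokenizer's word-to-subword alignment: by default only the first subword of each word keeps the word's label (the commit log's "configurable option to label all subwords" toggles this). A small sketch with the slow tokenizer, using the same checkpoint as the tests:

from transformers import LayoutLMv2Tokenizer

tokenizer = LayoutLMv2Tokenizer.from_pretrained("microsoft/layoutlmv2-base-uncased")

encoding = tokenizer(
    ["weirdly", "world"],
    boxes=[[1, 2, 3, 4], [5, 6, 7, 8]],
    word_labels=[1, 2],
)
# "weirdly" splits into two subwords; only the first keeps the label, every other
# position (including [CLS] and [SEP]) becomes -100 and is ignored by the loss
print(encoding["labels"])  # [-100, 1, -100, 2, -100]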
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import os
import shutil
import tempfile
import unittest
from typing import List
from transformers import AddedToken, LayoutLMv2TokenizerFast, SpecialTokensMixin, is_tf_available, is_torch_available
from transformers.models.layoutlmv2.tokenization_layoutlmv2 import (
VOCAB_FILES_NAMES,
BasicTokenizer,
LayoutLMv2Tokenizer,
WordpieceTokenizer,
_is_control,
_is_punctuation,
_is_whitespace,
)
from transformers.testing_utils import (
is_pt_tf_cross_test,
require_pandas,
require_scatter,
require_tokenizers,
require_torch,
slow,
)
from .test_tokenization_common import (
SMALL_TRAINING_CORPUS,
TokenizerTesterMixin,
filter_non_english,
merge_model_tokenizer_mappings,
)
@require_tokenizers
@require_pandas
class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = LayoutLMv2Tokenizer
rust_tokenizer_class = LayoutLMv2TokenizerFast
test_rust_tokenizer = True
space_between_special_tokens = True
from_pretrained_filter = filter_non_english
test_seq2seq = False
def get_words_and_boxes(self):
words = ["a", "weirdly", "test"]
boxes = [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]]
return words, boxes
def get_words_and_boxes_batch(self):
words = [["a", "weirdly", "test"], ["hello", "my", "name", "is", "bob"]]
boxes = [
[[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]],
[[961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69]],
]
return words, boxes
def get_question_words_and_boxes(self):
question = "what's his name?"
words = ["a", "weirdly", "test"]
boxes = [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]]
return question, words, boxes
def get_question_words_and_boxes_batch(self):
questions = ["what's his name?", "how is he called?"]
words = [["a", "weirdly", "test"], ["what", "a", "laif", "gastn"]]
boxes = [
[[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]],
[[256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69]],
]
return questions, words, boxes
def setUp(self):
super().setUp()
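        # minimal WordPiece vocab for the tests; note that "weirdly" tokenizes as "weird" + "##ly"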
vocab_tokens = [
"[UNK]",
"[CLS]",
"[SEP]",
"[PAD]",
"[MASK]",
"what",
"s",
"his",
"name",
"?",
"a",
"weird",
"##ly",
"test",
"lowest",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00E9d,running"
output_text = "unwanted, running"
return input_text, output_text
def test_chinese(self):
tokenizer = BasicTokenizer()
self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"])
def test_basic_tokenizer_lower(self):
tokenizer = BasicTokenizer(do_lower_case=True)
self.assertListEqual(
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"]
)
self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
def test_basic_tokenizer_lower_strip_accents_false(self):
tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)
self.assertListEqual(
tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"]
)
self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"])
def test_basic_tokenizer_lower_strip_accents_true(self):
tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)
self.assertListEqual(
tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"]
)
self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
def test_basic_tokenizer_lower_strip_accents_default(self):
tokenizer = BasicTokenizer(do_lower_case=True)
self.assertListEqual(
tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"]
)
self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
def test_basic_tokenizer_no_lower(self):
tokenizer = BasicTokenizer(do_lower_case=False)
self.assertListEqual(
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
)
def test_basic_tokenizer_no_lower_strip_accents_false(self):
tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)
self.assertListEqual(
tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
)
def test_basic_tokenizer_no_lower_strip_accents_true(self):
tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)
self.assertListEqual(
tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
)
def test_basic_tokenizer_respects_never_split_tokens(self):
tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])
self.assertListEqual(
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
)
def test_wordpiece_tokenizer(self):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
self.assertListEqual(tokenizer.tokenize(""), [])
self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])
self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
def test_is_whitespace(self):
self.assertTrue(_is_whitespace(" "))
self.assertTrue(_is_whitespace("\t"))
self.assertTrue(_is_whitespace("\r"))
self.assertTrue(_is_whitespace("\n"))
self.assertTrue(_is_whitespace("\u00A0"))
self.assertFalse(_is_whitespace("A"))
self.assertFalse(_is_whitespace("-"))
def test_is_control(self):
self.assertTrue(_is_control("\u0005"))
self.assertFalse(_is_control("A"))
self.assertFalse(_is_control(" "))
self.assertFalse(_is_control("\t"))
self.assertFalse(_is_control("\r"))
def test_is_punctuation(self):
self.assertTrue(_is_punctuation("-"))
self.assertTrue(_is_punctuation("$"))
self.assertTrue(_is_punctuation("`"))
self.assertTrue(_is_punctuation("."))
self.assertFalse(_is_punctuation("A"))
self.assertFalse(_is_punctuation(" "))
def test_clean_text(self):
tokenizer = self.get_tokenizer()
# Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
self.assertListEqual([tokenizer.tokenize(t) for t in ["Hello", "\xad", "hello"]], [["[UNK]"], [], ["[UNK]"]])
@slow
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutlmv2-base-uncased")
question, words, boxes = self.get_question_words_and_boxes()
text = tokenizer.encode(
question.split(),
boxes=[tokenizer.pad_token_box for _ in range(len(question.split()))],
add_special_tokens=False,
)
text_2 = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
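        # 101 and 102 are the ids of [CLS] and [SEP] in the base uncased vocab,
        # so an encoded pair is laid out as [CLS] question [SEP] words [SEP]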
assert encoded_pair == [101] + text + [102] + text_2 + [102]
def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes()
words[1] = tokenizer_r.mask_token
tokens = tokenizer_r.encode_plus(
words,
boxes=boxes,
return_attention_mask=False,
return_token_type_ids=False,
return_offsets_mapping=True,
add_special_tokens=True,
)
expected_results = [
((0, 0), tokenizer_r.cls_token),
((0, 1), "a"),
((0, 6), tokenizer_r.mask_token),
((0, 4), "test"),
((0, 0), tokenizer_r.sep_token),
]
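                # offsets are (start, end) character positions within each individual word;
                # special tokens added by the tokenizer map to (0, 0)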
self.assertEqual(
[e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
)
self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
def test_add_special_tokens(self):
tokenizers: List[LayoutLMv2Tokenizer] = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
special_token = "[SPECIAL_TOKEN]"
special_token_box = [1000, 1000, 1000, 1000]
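                # every token passed to encode needs a box, so give the special token a dummy one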
tokenizer.add_special_tokens({"cls_token": special_token})
encoded_special_token = tokenizer.encode(
[special_token], boxes=[special_token_box], add_special_tokens=False
)
self.assertEqual(len(encoded_special_token), 1)
decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True)
self.assertTrue(special_token not in decoded)
def test_add_tokens_tokenizer(self):
tokenizers: List[LayoutLMv2Tokenizer] = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
vocab_size = tokenizer.vocab_size
all_size = len(tokenizer)
self.assertNotEqual(vocab_size, 0)
# We usually have added tokens from the start in tests because our vocab fixtures are
# smaller than the original vocabs - let's not assert this
# self.assertEqual(vocab_size, all_size)
new_toks = ["aaaaa", "bbbbbb", "cccccccccdddddddd"]
added_toks = tokenizer.add_tokens(new_toks)
vocab_size_2 = tokenizer.vocab_size
all_size_2 = len(tokenizer)
self.assertNotEqual(vocab_size_2, 0)
self.assertEqual(vocab_size, vocab_size_2)
self.assertEqual(added_toks, len(new_toks))
self.assertEqual(all_size_2, all_size + len(new_toks))
words = "aaaaa bbbbbb low cccccccccdddddddd l".split()
boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
tokens = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
self.assertGreaterEqual(len(tokens), 4)
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
vocab_size_3 = tokenizer.vocab_size
all_size_3 = len(tokenizer)
self.assertNotEqual(vocab_size_3, 0)
self.assertEqual(vocab_size, vocab_size_3)
self.assertEqual(added_toks_2, len(new_toks_2))
self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
words = ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l".split()
boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
tokens = tokenizer.encode(
words,
boxes=boxes,
add_special_tokens=False,
)
self.assertGreaterEqual(len(tokens), 6)
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
self.assertGreater(tokens[0], tokens[1])
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
self.assertGreater(tokens[-2], tokens[-3])
self.assertEqual(tokens[0], tokenizer.eos_token_id)
self.assertEqual(tokens[-2], tokenizer.pad_token_id)
@require_tokenizers
def test_encode_decode_with_spaces(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes()
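                # normalized=False makes the added tokens match verbatim, without being
                # lowercased or otherwise normalized first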
new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
tokenizer.add_tokens(new_toks)
input = "[ABC][DEF][ABC][DEF]"
if self.space_between_special_tokens:
output = "[ABC] [DEF] [ABC] [DEF]"
else:
output = input
encoded = tokenizer.encode(input.split(), boxes=boxes, add_special_tokens=False)
decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
self.assertIn(decoded, [output, output.lower()])
def test_encode_plus_with_padding(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes()
                # check correct behaviour if no pad_token_id exists, and add one if needed
self._check_no_pad_token_padding(tokenizer, words)
padding_size = 10
padding_idx = tokenizer.pad_token_id
encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_special_tokens_mask=True)
input_ids = encoded_sequence["input_ids"]
special_tokens_mask = encoded_sequence["special_tokens_mask"]
sequence_length = len(input_ids)
# Test 'longest' and 'no_padding' don't do anything
tokenizer.padding_side = "right"
not_padded_sequence = tokenizer.encode_plus(
words,
boxes=boxes,
                    padding=True,
return_special_tokens_mask=True,
)
not_padded_input_ids = not_padded_sequence["input_ids"]
not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
not_padded_sequence_length = len(not_padded_input_ids)
self.assertTrue(sequence_length == not_padded_sequence_length)
self.assertTrue(input_ids == not_padded_input_ids)
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
not_padded_sequence = tokenizer.encode_plus(
words,
boxes=boxes,
padding=False,
return_special_tokens_mask=True,
)
not_padded_input_ids = not_padded_sequence["input_ids"]
not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
not_padded_sequence_length = len(not_padded_input_ids)
self.assertTrue(sequence_length == not_padded_sequence_length)
self.assertTrue(input_ids == not_padded_input_ids)
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
# Test right padding
tokenizer.padding_side = "right"
right_padded_sequence = tokenizer.encode_plus(
words,
boxes=boxes,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
right_padded_sequence_length = len(right_padded_input_ids)
self.assertTrue(sequence_length + padding_size == right_padded_sequence_length)
self.assertTrue(input_ids + [padding_idx] * padding_size == right_padded_input_ids)
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
# Test left padding
tokenizer.padding_side = "left"
left_padded_sequence = tokenizer.encode_plus(
words,
boxes=boxes,
max_length=sequence_length + padding_size,
padding="max_length",
return_special_tokens_mask=True,
)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)
self.assertTrue(sequence_length + padding_size == left_padded_sequence_length)
self.assertTrue([padding_idx] * padding_size + input_ids == left_padded_input_ids)
self.assertTrue([1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask)
if "token_type_ids" in tokenizer.model_input_names:
token_type_ids = encoded_sequence["token_type_ids"]
left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
right_padded_token_type_ids = right_padded_sequence["token_type_ids"]
assert token_type_ids + [0] * padding_size == right_padded_token_type_ids
assert [0] * padding_size + token_type_ids == left_padded_token_type_ids
if "attention_mask" in tokenizer.model_input_names:
attention_mask = encoded_sequence["attention_mask"]
right_padded_attention_mask = right_padded_sequence["attention_mask"]
left_padded_attention_mask = left_padded_sequence["attention_mask"]
self.assertTrue(attention_mask + [0] * padding_size == right_padded_attention_mask)
self.assertTrue([0] * padding_size + attention_mask == left_padded_attention_mask)
def test_internal_consistency(self):
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes()
tokens = []
for word in words:
tokens.extend(tokenizer.tokenize(word))
ids = tokenizer.convert_tokens_to_ids(tokens)
ids_2 = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
self.assertListEqual(ids, ids_2)
tokens_2 = tokenizer.convert_ids_to_tokens(ids)
self.assertNotEqual(len(tokens_2), 0)
text_2 = tokenizer.decode(ids)
self.assertIsInstance(text_2, str)
output_text = "a weirdly test"
self.assertEqual(text_2, output_text)
def test_mask_output(self):
tokenizers = self.get_tokenizers(fast=False, do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes()
if (
tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer"
and "token_type_ids" in tokenizer.model_input_names
):
information = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True)
sequences, mask = information["input_ids"], information["token_type_ids"]
self.assertEqual(len(sequences), len(mask))
def test_number_of_added_tokens(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
# test 1: single sequence
words, boxes = self.get_words_and_boxes()
sequences = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
attached_sequences = tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
# Method is implemented (e.g. not GPT-2)
if len(attached_sequences) != 2:
self.assertEqual(
tokenizer.num_special_tokens_to_add(pair=False), len(attached_sequences) - len(sequences)
)
# test 2: two sequences
question, words, boxes = self.get_question_words_and_boxes()
sequences = tokenizer.encode(question, words, boxes=boxes, add_special_tokens=False)
attached_sequences = tokenizer.encode(question, words, boxes=boxes, add_special_tokens=True)
# Method is implemented (e.g. not GPT-2)
if len(attached_sequences) != 2:
self.assertEqual(
tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
)
def test_padding_to_max_length(self):
"""We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes()
padding_size = 10
                # check correct behaviour if no pad_token_id exists, and add one if needed
self._check_no_pad_token_padding(tokenizer, words)
padding_idx = tokenizer.pad_token_id
# Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "right"
encoded_sequence = tokenizer.encode(words, boxes=boxes)
sequence_length = len(encoded_sequence)
                # FIXME: the next line should use padding="max_length" to avoid a warning
padded_sequence = tokenizer.encode(
words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True
)
padded_sequence_length = len(padded_sequence)
assert sequence_length + padding_size == padded_sequence_length
assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
# Check that nothing is done when a maximum length is not specified
encoded_sequence = tokenizer.encode(words, boxes=boxes)
sequence_length = len(encoded_sequence)
tokenizer.padding_side = "right"
padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True)
padded_sequence_right_length = len(padded_sequence_right)
assert sequence_length == padded_sequence_right_length
assert encoded_sequence == padded_sequence_right
def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id
# Encode - Simple input
words, boxes = self.get_words_and_boxes()
input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode(words, boxes=boxes, padding="longest")
input_p = tokenizer_p.encode(words, boxes=boxes, padding=True)
self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)
# Encode - Pair input
question, words, boxes = self.get_question_words_and_boxes()
input_r = tokenizer_r.encode(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode(question, words, boxes=boxes, padding=True)
input_p = tokenizer_p.encode(question, words, boxes=boxes, padding="longest")
self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)
# Encode_plus - Simple input
words, boxes = self.get_words_and_boxes()
input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(words, boxes=boxes, padding="longest")
input_p = tokenizer_p.encode_plus(words, boxes=boxes, padding=True)
self.assert_padded_input_match(
input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
# Encode_plus - Pair input
question, words, boxes = self.get_question_words_and_boxes()
input_r = tokenizer_r.encode_plus(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(
question, words, boxes=boxes, max_length=max_length, padding="max_length"
)
input_p = tokenizer_p.encode_plus(
question, words, boxes=boxes, max_length=max_length, padding="max_length"
)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(question, words, boxes=boxes, padding="longest")
input_p = tokenizer_p.encode_plus(question, words, boxes=boxes, padding=True)
self.assert_padded_input_match(
input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
# Batch_encode_plus - Simple input
words, boxes = self.get_words_and_boxes_batch()
input_r = tokenizer_r.batch_encode_plus(
words,
boxes=boxes,
max_length=max_length,
pad_to_max_length=True,
)
input_p = tokenizer_p.batch_encode_plus(
words,
boxes=boxes,
max_length=max_length,
pad_to_max_length=True,
)
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.batch_encode_plus(
words,
boxes=boxes,
max_length=max_length,
padding="max_length",
)
input_p = tokenizer_p.batch_encode_plus(
words,
boxes=boxes,
max_length=max_length,
padding="max_length",
)
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.batch_encode_plus(
words,
boxes=boxes,
max_length=max_length,
padding="longest",
)
input_p = tokenizer_p.batch_encode_plus(
words,
boxes=boxes,
max_length=max_length,
padding=True,
)
self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)
input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes, padding="longest")
input_p = tokenizer_p.batch_encode_plus(words, boxes=boxes, padding=True)
self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)
# Batch_encode_plus - Pair input
questions, words, boxes = self.get_question_words_and_boxes_batch()
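                # is_pair=True signals that each batch entry is a (question, words) pair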
input_r = tokenizer_r.batch_encode_plus(
list(zip(questions, words)),
is_pair=True,
boxes=boxes,
max_length=max_length,
truncation=True,
padding="max_length",
)
input_p = tokenizer_p.batch_encode_plus(
list(zip(questions, words)),
is_pair=True,
boxes=boxes,
max_length=max_length,
truncation=True,
padding="max_length",
)
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.batch_encode_plus(
list(zip(questions, words)),
is_pair=True,
boxes=boxes,
padding=True,
)
input_p = tokenizer_p.batch_encode_plus(
list(zip(questions, words)),
is_pair=True,
boxes=boxes,
padding="longest",
)
self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)
# Using pad on single examples after tokenization
words, boxes = self.get_words_and_boxes()
input_r = tokenizer_r.encode_plus(words, boxes=boxes)
input_r = tokenizer_r.pad(input_r)
input_p = tokenizer_r.encode_plus(words, boxes=boxes)
input_p = tokenizer_r.pad(input_p)
self.assert_padded_input_match(
input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
)
# Using pad on single examples after tokenization
input_r = tokenizer_r.encode_plus(words, boxes=boxes)
input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")
input_p = tokenizer_r.encode_plus(words, boxes=boxes)
input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
# Using pad after tokenization
words, boxes = self.get_words_and_boxes_batch()
input_r = tokenizer_r.batch_encode_plus(
words,
boxes=boxes,
)
input_r = tokenizer_r.pad(input_r)
input_p = tokenizer_r.batch_encode_plus(
words,
boxes=boxes,
)
input_p = tokenizer_r.pad(input_p)
self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)
# Using pad after tokenization
words, boxes = self.get_words_and_boxes_batch()
input_r = tokenizer_r.batch_encode_plus(
words,
boxes=boxes,
)
input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")
input_p = tokenizer_r.batch_encode_plus(
words,
boxes=boxes,
)
input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
def test_call(self):
        # Tests that __call__ wraps encode_plus and batch_encode_plus
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
# Test not batched
words, boxes = self.get_words_and_boxes()
encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes)
encoded_sequences_2 = tokenizer(words, boxes=boxes)
self.assertEqual(encoded_sequences_1, encoded_sequences_2)
# Test not batched pairs
question, words, boxes = self.get_question_words_and_boxes()
                encoded_sequences_1 = tokenizer.encode_plus(question, words, boxes=boxes)
                encoded_sequences_2 = tokenizer(question, words, boxes=boxes)
self.assertEqual(encoded_sequences_1, encoded_sequences_2)
# Test batched
words, boxes = self.get_words_and_boxes_batch()
encoded_sequences_1 = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes)
encoded_sequences_2 = tokenizer(words, boxes=boxes)
self.assertEqual(encoded_sequences_1, encoded_sequences_2)
def test_batch_encode_plus_batch_sequence_length(self):
# Tests that all encoded values have the correct size
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes_batch()
encoded_sequences = [
tokenizer.encode_plus(words_example, boxes=boxes_example)
for words_example, boxes_example in zip(words, boxes)
]
encoded_sequences_batch = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes, padding=False)
self.assertListEqual(
encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
)
maximum_length = len(
max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len)
)
                # check correct behaviour if no pad_token_id exists, and add one if needed
self._check_no_pad_token_padding(tokenizer, words)
encoded_sequences_padded = [
tokenizer.encode_plus(
words_example, boxes=boxes_example, max_length=maximum_length, padding="max_length"
)
for words_example, boxes_example in zip(words, boxes)
]
encoded_sequences_batch_padded = tokenizer.batch_encode_plus(
words, is_pair=False, boxes=boxes, padding=True
)
self.assertListEqual(
encoded_sequences_padded,
self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded),
)
                # check 'longest' is insensitive to a max length
encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(
words, is_pair=False, boxes=boxes, padding=True
)
encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding="longest"
)
for key in encoded_sequences_batch_padded_1.keys():
self.assertListEqual(
encoded_sequences_batch_padded_1[key],
encoded_sequences_batch_padded_2[key],
)
                # check 'no_padding' is insensitive to a max length
encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(
words, is_pair=False, boxes=boxes, padding=False
)
encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding=False
)
for key in encoded_sequences_batch_padded_1.keys():
self.assertListEqual(
encoded_sequences_batch_padded_1[key],
encoded_sequences_batch_padded_2[key],
)
@unittest.skip("batch_encode_plus does not handle overflowing tokens.")
def test_batch_encode_plus_overflowing_tokens(self):
pass
def test_batch_encode_plus_padding(self):
# Test that padded sequences are equivalent between batch_encode_plus and encode_plus
# Right padding tests
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes_batch()
max_length = 100
                # check correct behaviour if no pad_token_id exists, and add one if needed
self._check_no_pad_token_padding(tokenizer, words)
encoded_sequences = [
tokenizer.encode_plus(
words_example, boxes=boxes_example, max_length=max_length, padding="max_length"
)
for words_example, boxes_example in zip(words, boxes)
]
encoded_sequences_batch = tokenizer.batch_encode_plus(
words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length"
)
self.assertListEqual(
encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
)
# Left padding tests
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
tokenizer.padding_side = "left"
words, boxes = self.get_words_and_boxes_batch()
max_length = 100
                # check correct behaviour if no pad_token_id exists, and add one if needed
self._check_no_pad_token_padding(tokenizer, words)
encoded_sequences = [
tokenizer.encode_plus(
words_example, boxes=boxes_example, max_length=max_length, padding="max_length"
)
for words_example, boxes_example in zip(words, boxes)
]
encoded_sequences_batch = tokenizer.batch_encode_plus(
words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length"
)
self.assertListEqual(
encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
)
def test_padding_to_multiple_of(self):
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.pad_token is None:
self.skipTest("No padding token.")
else:
words, boxes = self.get_words_and_boxes()
# empty_tokens = tokenizer([""], [[]], padding=True, pad_to_multiple_of=8)
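                    # with padding enabled, pad_to_multiple_of=8 rounds the padded
                    # length up to the next multiple of 8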
normal_tokens = tokenizer(words, boxes=boxes, padding=True, pad_to_multiple_of=8)
# for key, value in empty_tokens.items():
# self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
for key, value in normal_tokens.items():
self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
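                    # without padding, pad_to_multiple_of has no effect, so the natural
                    # (non-multiple-of-8) sequence length is returned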
normal_tokens = tokenizer(words, boxes=boxes, pad_to_multiple_of=8)
for key, value in normal_tokens.items():
                        self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is unexpectedly a multiple of 8")
# Should also work with truncation
normal_tokens = tokenizer(words, boxes=boxes, padding=True, truncation=True, pad_to_multiple_of=8)
for key, value in normal_tokens.items():
self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
                    # truncating to a length that is not a multiple of pad_to_multiple_of raises an error
self.assertRaises(
ValueError,
tokenizer.__call__,
words,
boxes=boxes,
padding=True,
truncation=True,
max_length=12,
pad_to_multiple_of=8,
)
def test_tokenizer_slow_store_full_signature(self):
signature = inspect.signature(self.tokenizer_class.__init__)
tokenizer = self.get_tokenizer()
for parameter_name, parameter in signature.parameters.items():
if parameter.default != inspect.Parameter.empty:
self.assertIn(parameter_name, tokenizer.init_kwargs)
def test_build_inputs_with_special_tokens(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Input tokens id
words, boxes = self.get_words_and_boxes()
input_simple = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False)
input_pair = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False)
# Generate output
output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
self.assertEqual(output_p, output_r)
# Generate pair output
output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
self.assertEqual(output_p, output_r)
def test_special_tokens_mask_input_pairs(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes()
encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
encoded_sequence_dict = tokenizer.encode_plus(
words,
boxes=boxes,
add_special_tokens=True,
return_special_tokens_mask=True,
# add_prefix_space=False,
)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
filtered_sequence = [
(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
]
filtered_sequence = [x for x in filtered_sequence if x is not None]
self.assertEqual(encoded_sequence, filtered_sequence)
def test_special_tokens_mask(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes()
# Testing single inputs
encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
encoded_sequence_dict = tokenizer.encode_plus(
words, boxes=boxes, add_special_tokens=True, return_special_tokens_mask=True
)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]]
self.assertEqual(encoded_sequence, filtered_sequence)
def test_save_and_load_tokenizer(self):
# safety check on max_len default value so we are sure the test works
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
self.assertNotEqual(tokenizer.model_max_length, 42)
# Now let's start the test
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
# Isolate this from the other tests because we save additional tokens/etc
words, boxes = self.get_words_and_boxes()
tmpdirname = tempfile.mkdtemp()
before_tokens = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
before_vocab = tokenizer.get_vocab()
tokenizer.save_pretrained(tmpdirname)
after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
after_tokens = after_tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
after_vocab = after_tokenizer.get_vocab()
self.assertListEqual(before_tokens, after_tokens)
self.assertDictEqual(before_vocab, after_vocab)
shutil.rmtree(tmpdirname)
def test_right_and_left_padding(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes()
sequence = "Sequence"
padding_size = 10
                # check correct behaviour if no pad_token_id exists, and add one if needed
self._check_no_pad_token_padding(tokenizer, sequence)
padding_idx = tokenizer.pad_token_id
# RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "right"
encoded_sequence = tokenizer.encode(words, boxes=boxes)
sequence_length = len(encoded_sequence)
padded_sequence = tokenizer.encode(
words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length"
)
padded_sequence_length = len(padded_sequence)
assert sequence_length + padding_size == padded_sequence_length
assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
# LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "left"
encoded_sequence = tokenizer.encode(words, boxes=boxes)
sequence_length = len(encoded_sequence)
padded_sequence = tokenizer.encode(
words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length"
)
padded_sequence_length = len(padded_sequence)
assert sequence_length + padding_size == padded_sequence_length
assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
# RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
encoded_sequence = tokenizer.encode(words, boxes=boxes)
sequence_length = len(encoded_sequence)
tokenizer.padding_side = "right"
padded_sequence_right = tokenizer.encode(words, boxes=boxes, padding=True)
padded_sequence_right_length = len(padded_sequence_right)
assert sequence_length == padded_sequence_right_length
assert encoded_sequence == padded_sequence_right
tokenizer.padding_side = "left"
padded_sequence_left = tokenizer.encode(words, boxes=boxes, padding="longest")
padded_sequence_left_length = len(padded_sequence_left)
assert sequence_length == padded_sequence_left_length
assert encoded_sequence == padded_sequence_left
tokenizer.padding_side = "right"
padded_sequence_right = tokenizer.encode(words, boxes=boxes)
padded_sequence_right_length = len(padded_sequence_right)
assert sequence_length == padded_sequence_right_length
assert encoded_sequence == padded_sequence_right
tokenizer.padding_side = "left"
padded_sequence_left = tokenizer.encode(words, boxes=boxes, padding=False)
padded_sequence_left_length = len(padded_sequence_left)
assert sequence_length == padded_sequence_left_length
assert encoded_sequence == padded_sequence_left
def test_token_type_ids(self):
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
# test 1: single sequence
words, boxes = self.get_words_and_boxes()
output = tokenizer(words, boxes=boxes, return_token_type_ids=True)
# Assert that the token type IDs have the same length as the input IDs
self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"]))
# Assert that the token type IDs have the same length as the attention mask
self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"]))
self.assertIn(0, output["token_type_ids"])
self.assertNotIn(1, output["token_type_ids"])
# test 2: two sequences (question + words)
question, words, boxes = self.get_question_words_and_boxes()
output = tokenizer(question, words, boxes, return_token_type_ids=True)
# Assert that the token type IDs have the same length as the input IDs
self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"]))
# Assert that the token type IDs have the same length as the attention mask
self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"]))
self.assertIn(0, output["token_type_ids"])
self.assertIn(1, output["token_type_ids"])
def test_offsets_mapping(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
text = ["a", "wonderful", "test"]
boxes = [[1, 8, 12, 20] for _ in range(len(text))]
# No pair
tokens_with_offsets = tokenizer_r.encode_plus(
text,
boxes=boxes,
return_special_tokens_mask=True,
return_offsets_mapping=True,
add_special_tokens=True,
)
added_tokens = tokenizer_r.num_special_tokens_to_add(False)
offsets = tokens_with_offsets["offset_mapping"]
# Assert there is the same number of tokens and offsets
self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
                # Assert there are only `added_tokens` special tokens
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
# Pairs
text = "what's his name"
pair = ["a", "wonderful", "test"]
boxes = [[1, 8, 12, 20] for _ in range(len(pair))]
tokens_with_offsets = tokenizer_r.encode_plus(
text,
pair,
boxes=boxes,
return_special_tokens_mask=True,
return_offsets_mapping=True,
add_special_tokens=True,
)
added_tokens = tokenizer_r.num_special_tokens_to_add(True)
offsets = tokens_with_offsets["offset_mapping"]
# Assert there is the same number of tokens and offsets
self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
                # Assert there are only `added_tokens` special tokens
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
@require_torch
@slow
@require_scatter
def test_torch_encode_plus_sent_to_model(self):
import torch
from transformers import MODEL_MAPPING, TOKENIZER_MAPPING
MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING)
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
return
config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
config = config_class()
if config.is_encoder_decoder or config.pad_token_id is None:
return
model = model_class(config)
# Make sure the model contains at least the full vocabulary size in its embedding matrix
is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight")
assert (
(model.get_input_embeddings().weight.shape[0] >= len(tokenizer))
if is_using_common_embeddings
else True
)
# Build sequence
words, boxes = self.get_words_and_boxes()
encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt")
                batch_encoded_sequence = tokenizer.batch_encode_plus(
                    [words, words], boxes=[boxes, boxes], return_tensors="pt"
                )
# This should not fail
with torch.no_grad(): # saves some time
model(**encoded_sequence)
model(**batch_encoded_sequence)
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
words, boxes = self.get_words_and_boxes()
ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
self.assertListEqual(ids, rust_ids)
def test_tokenization_python_rust_equals(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes()
# Ensure basic input match
input_p = tokenizer_p.encode_plus(words, boxes=boxes)
input_r = tokenizer_r.encode_plus(words, boxes=boxes)
for key in filter(
lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
):
self.assertSequenceEqual(input_p[key], input_r[key])
input_pairs_p = tokenizer_p.encode_plus(words, boxes=boxes)
input_pairs_r = tokenizer_r.encode_plus(words, boxes=boxes)
for key in filter(
lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
):
self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])
words = ["hello" for _ in range(1000)]
boxes = [[1000, 1000, 1000, 1000] for _ in range(1000)]
# Ensure truncation match
input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=512, truncation=True)
input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=512, truncation=True)
for key in filter(
lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
):
self.assertSequenceEqual(input_p[key], input_r[key])
# Ensure truncation with stride match
input_p = tokenizer_p.encode_plus(
words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
)
input_r = tokenizer_r.encode_plus(
words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
)
for key in filter(
lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
):
self.assertSequenceEqual(input_p[key], input_r[key][0])
def test_embeded_special_tokens(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes()
tokens_r = tokenizer_r.encode_plus(
words,
boxes=boxes,
add_special_tokens=True,
)
tokens_p = tokenizer_p.encode_plus(
words,
boxes=boxes,
add_special_tokens=True,
)
for key in tokens_p.keys():
self.assertEqual(tokens_r[key], tokens_p[key])
if "token_type_ids" in tokens_r:
self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
self.assertSequenceEqual(tokens_r, tokens_p)
def test_compare_add_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
words, boxes = self.get_words_and_boxes()
# tokenize()
no_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=False)
with_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=True)
self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add)
# encode()
no_special_tokens = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=False)
with_special_tokens = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=True)
self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add)
# encode_plus()
no_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=False)
with_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=True)
for key in no_special_tokens.keys():
self.assertEqual(
len(no_special_tokens[key]),
len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
)
# # batch_encode_plus
words, boxes = self.get_words_and_boxes_batch()
no_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=False)
with_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=True)
for key in no_special_tokens.keys():
for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
@slow
def test_layoutlmv2_truncation_integration_test(self):
words, boxes = self.get_words_and_boxes()
tokenizer = LayoutLMv2Tokenizer.from_pretrained("microsoft/layoutlmv2-base-uncased", model_max_length=512)
for i in range(12, 512):
new_encoded_inputs = tokenizer.encode(words, boxes=boxes, max_length=i, truncation=True)
# Ensure that the input IDs are less than the max length defined.
self.assertLessEqual(len(new_encoded_inputs), i)
tokenizer.model_max_length = 20
new_encoded_inputs = tokenizer.encode(words, boxes=boxes, truncation=True)
dropped_encoded_inputs = tokenizer.encode(words, boxes=boxes, truncation=True)
# Ensure that the input IDs are still truncated when no max_length is specified
self.assertListEqual(new_encoded_inputs, dropped_encoded_inputs)
self.assertLessEqual(len(new_encoded_inputs), 20)
@is_pt_tf_cross_test
def test_batch_encode_plus_tensors(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes_batch()
                # A tensor cannot be built from sequences that are not the same size
self.assertRaises(ValueError, tokenizer.batch_encode_plus, words, boxes=boxes, return_tensors="pt")
self.assertRaises(ValueError, tokenizer.batch_encode_plus, words, boxes=boxes, return_tensors="tf")
if tokenizer.pad_token_id is None:
self.assertRaises(
ValueError,
tokenizer.batch_encode_plus,
words,
boxes=boxes,
padding=True,
return_tensors="pt",
)
self.assertRaises(
ValueError,
tokenizer.batch_encode_plus,
words,
boxes=boxes,
padding="longest",
return_tensors="tf",
)
else:
pytorch_tensor = tokenizer.batch_encode_plus(words, boxes=boxes, padding=True, return_tensors="pt")
tensorflow_tensor = tokenizer.batch_encode_plus(
words, boxes=boxes, padding="longest", return_tensors="tf"
)
encoded_sequences = tokenizer.batch_encode_plus(words, boxes=boxes, padding=True)
for key in encoded_sequences.keys():
pytorch_value = pytorch_tensor[key].tolist()
tensorflow_value = tensorflow_tensor[key].numpy().tolist()
encoded_value = encoded_sequences[key]
self.assertEqual(pytorch_value, tensorflow_value, encoded_value)
def test_sequence_ids(self):
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
if not tokenizer.is_fast:
continue
with self.subTest(f"{tokenizer.__class__.__name__}"):
seq_0 = "Test this method."
seq_1 = ["With", "these", "inputs."]
boxes = [[1000, 1000, 1000, 1000] for _ in range(len(seq_1))]
                # We want sequence 0 and sequence 1 to be tagged with sequence ids
                # 0 and 1 respectively, regardless of whether the model uses token
                # type ids. The QA pipeline, among other places, relies on this.
output = tokenizer(seq_0.split(), boxes=boxes)
self.assertIn(0, output.sequence_ids())
output = tokenizer(seq_0, seq_1, boxes=boxes)
self.assertIn(0, output.sequence_ids())
self.assertIn(1, output.sequence_ids())
if tokenizer.num_special_tokens_to_add(pair=True):
self.assertIn(None, output.sequence_ids())
def test_special_tokens_initialization(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)]
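                # lstrip=True lets the special token absorb whitespace on its left when matched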
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
pretrained_name, additional_special_tokens=added_tokens, **kwargs
)
words = "Hey this is a <special> token".split()
boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
r_output = tokenizer_r.encode(words, boxes=boxes)
special_token_id = tokenizer_r.encode(
["<special>"], boxes=[1000, 1000, 1000, 1000], add_special_tokens=False
)[0]
self.assertTrue(special_token_id in r_output)
if self.test_slow_tokenizer:
tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
)
tokenizer_p = self.tokenizer_class.from_pretrained(
pretrained_name, additional_special_tokens=added_tokens, **kwargs
)
words = "Hey this is a <special> token".split()
boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
p_output = tokenizer_p.encode(words, boxes=boxes)
cr_output = tokenizer_cr.encode(words, boxes=boxes)
self.assertEqual(p_output, r_output)
self.assertEqual(cr_output, r_output)
self.assertTrue(special_token_id in p_output)
self.assertTrue(special_token_id in cr_output)
def test_training_new_tokenizer(self):
# This feature only exists for fast tokenizers
if not self.test_rust_tokenizer:
return
tokenizer = self.get_rust_tokenizer()
new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100)
# Test we can use the new tokenizer with something not seen during training
text = [["this", "is", "the"], ["how", "are", "you"]]
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8], [1, 3, 4, 8]], [[5, 6, 7, 8], [4, 5, 6, 7], [3, 9, 2, 7]]]
inputs = new_tokenizer(text, boxes=boxes)
self.assertEqual(len(inputs["input_ids"]), 2)
decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
expected_result = "this is the"
if tokenizer.backend_tokenizer.normalizer is not None:
expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
self.assertEqual(expected_result, decoded_input)
# We check that the parameters of the tokenizer remained the same
# Check we have the same number of added_tokens for both pair and non-pair inputs.
self.assertEqual(tokenizer.num_special_tokens_to_add(False), new_tokenizer.num_special_tokens_to_add(False))
self.assertEqual(tokenizer.num_special_tokens_to_add(True), new_tokenizer.num_special_tokens_to_add(True))
# Check we have the correct max_length for both pair and non-pair inputs.
self.assertEqual(tokenizer.max_len_single_sentence, new_tokenizer.max_len_single_sentence)
self.assertEqual(tokenizer.max_len_sentences_pair, new_tokenizer.max_len_sentences_pair)
# Assert the set of special tokens match as we didn't ask to change them
self.assertSequenceEqual(
tokenizer.all_special_tokens_extended,
new_tokenizer.all_special_tokens_extended,
)
self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map)
def test_training_new_tokenizer_with_special_tokens_change(self):
# This feature only exists for fast tokenizers
if not self.test_rust_tokenizer:
return
tokenizer = self.get_rust_tokenizer()
# Test with a special tokens map
class_signature = inspect.signature(tokenizer.__class__)
if "cls_token" in class_signature.parameters:
new_tokenizer = tokenizer.train_new_from_iterator(
SMALL_TRAINING_CORPUS, 100, special_tokens_map={tokenizer.cls_token: "<cls>"}
)
cls_id = new_tokenizer.get_vocab()["<cls>"]
self.assertEqual(new_tokenizer.cls_token, "<cls>")
self.assertEqual(new_tokenizer.cls_token_id, cls_id)
# Create a new mapping from the special tokens defined in the original tokenizer
special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
special_tokens_list.remove("additional_special_tokens")
special_tokens_map = {}
for token in special_tokens_list:
# Get the private one to avoid unnecessary warnings.
if getattr(tokenizer, f"_{token}") is not None:
special_token = getattr(tokenizer, token)
special_tokens_map[special_token] = f"{special_token}a"
# Train new tokenizer
new_tokenizer = tokenizer.train_new_from_iterator(
SMALL_TRAINING_CORPUS, 100, special_tokens_map=special_tokens_map
)
# Check the changes
for token in special_tokens_list:
# Get the private one to avoid unnecessary warnings.
if getattr(tokenizer, f"_{token}") is None:
continue
special_token = getattr(tokenizer, token)
if special_token in special_tokens_map:
new_special_token = getattr(new_tokenizer, token)
self.assertEqual(special_tokens_map[special_token], new_special_token)
new_id = new_tokenizer.get_vocab()[new_special_token]
self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id)
# Check if the AddedToken / string format has been kept
for special_token in tokenizer.all_special_tokens_extended:
if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map:
# The special token must appear identically in the list of the new tokenizer.
self.assertTrue(
special_token in new_tokenizer.all_special_tokens_extended,
f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
)
elif isinstance(special_token, AddedToken):
# The special token must appear in the list of the new tokenizer as an object of type AddedToken with
# the same parameters as the old AddedToken except the content that the user has requested to change.
special_token_str = special_token.content
new_special_token_str = special_tokens_map[special_token_str]
found = False
for candidate in new_tokenizer.all_special_tokens_extended:
if (
isinstance(candidate, AddedToken)
and candidate.content == new_special_token_str
and candidate.lstrip == special_token.lstrip
and candidate.rstrip == special_token.rstrip
and candidate.normalized == special_token.normalized
and candidate.single_word == special_token.single_word
):
found = True
break
self.assertTrue(
found,
(
f"'{new_special_token_str}' doesn't appear in the list "
f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as "
f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}"
),
)
elif special_token not in special_tokens_map:
# The special token must appear identically in the list of the new tokenizer.
self.assertTrue(
special_token in new_tokenizer.all_special_tokens_extended,
f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
)
else:
# The special token must appear in the list of the new tokenizer as an object of type string.
self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended)
# Test we can use the new tokenizer with something not seen during training
words = [["this", "is"], ["hello", "🤗"]]
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[1, 2, 3, 4], [5, 6, 7, 8]]]
inputs = new_tokenizer(words, boxes=boxes)
self.assertEqual(len(inputs["input_ids"]), 2)
decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
expected_result = "this is"
if tokenizer.backend_tokenizer.normalizer is not None:
expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
self.assertEqual(expected_result, decoded_input)
def test_prepare_for_model(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
# only test prepare_for_model for the slow tokenizer
if tokenizer.__class__.__name__ == "LayoutLMv2TokenizerFast":
continue
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes()
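# prepare_for_model and encode_plus should produce identical encodings for a single
# pre-tokenized sequence with boxes.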
prepared_input_dict = tokenizer.prepare_for_model(words, boxes=boxes, add_special_tokens=True)
input_dict = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True)
self.assertEqual(input_dict, prepared_input_dict)
def test_padding_different_model_input_name(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id
words, boxes = self.get_words_and_boxes_batch()
input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes)
input_p = tokenizer_p.batch_encode_plus(words, boxes=boxes)
# rename encoded batch to "inputs"
input_r["inputs"] = input_r[tokenizer_r.model_input_names[0]]
del input_r[tokenizer_r.model_input_names[0]]
input_p["inputs"] = input_p[tokenizer_p.model_input_names[0]]
del input_p[tokenizer_p.model_input_names[0]]
# Renaming `input_ids` to `inputs`
tokenizer_r.model_input_names = ["inputs"] + tokenizer_r.model_input_names[1:]
tokenizer_p.model_input_names = ["inputs"] + tokenizer_p.model_input_names[1:]
input_r = tokenizer_r.pad(input_r, padding="longest")
input_p = tokenizer_p.pad(input_p, padding="longest")
max_length = len(input_p["inputs"][0])
self.assert_batch_padded_input_match(
input_r, input_p, max_length, pad_token_id, model_main_input_name="inputs"
)
def test_batch_encode_dynamic_overflowing(self):
"""
When calling batch_encode with multiple sequences, it can return different number of
overflowing encoding for each sequence:
[
Sequence 1: [Encoding 1, Encoding 2],
Sequence 2: [Encoding 1],
Sequence 3: [Encoding 1, Encoding 2, ... Encoding N]
]
This needs to be padded so that it can represented as a tensor
"""
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
if is_torch_available():
returned_tensor = "pt"
elif is_tf_available():
returned_tensor = "tf"
else:
returned_tensor = "jax"
# Single example
words, boxes = self.get_words_and_boxes()
tokens = tokenizer.encode_plus(
words,
boxes=boxes,
max_length=6,
padding=True,
truncation=True,
return_tensors=returned_tensor,
return_overflowing_tokens=True,
)
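# Every output is a 2-D tensor (num_encodings, seq_len), except "bbox", which
# carries 4 box coordinates per token and is therefore 3-D (num_encodings, seq_len, 4).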
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
if key != "bbox":
self.assertEqual(len(tokens[key].shape), 2)
else:
self.assertEqual(len(tokens[key].shape), 3)
# Batch of examples
# For these 2 input examples, 3 encodings will be created due to overflowing tokens
words, boxes = self.get_words_and_boxes_batch()
tokens = tokenizer.batch_encode_plus(
words,
boxes=boxes,
max_length=6,
padding=True,
truncation="only_first",
return_tensors=returned_tensor,
return_overflowing_tokens=True,
)
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
if key != "bbox":
self.assertEqual(len(tokens[key].shape), 2)
self.assertEqual(tokens[key].shape[-1], 6)
else:
self.assertEqual(len(tokens[key].shape), 3)
self.assertEqual(tokens[key].shape[-1], 4)
@unittest.skip("TO DO: overwrite this very extensive test.")
def test_alignement_methods(self):
pass
@unittest.skip("LayoutLMv2 tokenizer requires boxes besides sequences.")
def test_maximum_encoding_length_pair_input(self):
pass
@unittest.skip("LayoutLMv2 tokenizer requires boxes besides sequences.")
def test_maximum_encoding_length_single_input(self):
pass
@unittest.skip("LayoutLMv2 tokenizer requires boxes besides sequences.")
def test_pretokenized_inputs(self):
pass
@unittest.skip("LayoutLMv2 tokenizer always expects pretokenized inputs.")
def test_compare_pretokenized_inputs(self):
pass
@unittest.skip("LayoutLMv2 fast tokenizer does not support prepare_for_model")
def test_compare_prepare_for_model(self):
pass
@slow
def test_only_label_first_subword(self):
words = ["hello", "niels"]
boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
word_labels = [0, 1]
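# "niels" is split into two subwords, so the encoding has 5 tokens: [CLS], "hello",
# the two subwords of "niels", and [SEP]. Special tokens always get the ignore index
# -100; whether the second subword keeps the word's label depends on
# only_label_first_subword.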
# test slow tokenizer
tokenizer_p = LayoutLMv2Tokenizer.from_pretrained("microsoft/layoutlmv2-base-uncased")
encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
tokenizer_p = LayoutLMv2Tokenizer.from_pretrained(
"microsoft/layoutlmv2-base-uncased", only_label_first_subword=False
)
encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
# test fast tokenizer
tokenizer_r = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
tokenizer_r = LayoutLMv2TokenizerFast.from_pretrained(
"microsoft/layoutlmv2-base-uncased", only_label_first_subword=False
)
encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
@slow
def test_layoutlmv2_integration_test(self):
tokenizer_p = LayoutLMv2Tokenizer.from_pretrained("microsoft/layoutlmv2-base-uncased")
tokenizer_r = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
# There are 3 cases:
# CASE 1: document image classification (training + inference), document image token classification (inference),
# in which case only words and normalized bounding boxes are provided to the tokenizer
# CASE 2: document image token classification (training),
# in which case one also provides word labels to the tokenizer
# CASE 3: document image visual question answering (inference),
# in which case one also provides a question to the tokenizer
# We need to test all 3 cases both on batched and non-batched inputs.
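# Note on the expected "bbox" values below: the tokenizer assigns [0, 0, 0, 0] to
# [CLS] and padding tokens, [1000, 1000, 1000, 1000] to [SEP], and repeats a word's
# box for each of its subword tokens.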
# CASE 1: not batched
words, boxes = self.get_words_and_boxes()
# fmt: off
expected_results = {'input_ids': [101, 1037, 6881, 2135, 3231, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} # noqa: E231
# fmt: on
encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20)
encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20)
self.assertDictEqual(dict(encoding_p), expected_results)
self.assertDictEqual(dict(encoding_r), expected_results)
# CASE 1: batched
words, boxes = self.get_words_and_boxes_batch()
# fmt: off
expected_results = {'input_ids': [[101, 1037, 6881, 2135, 3231, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 7592, 2026, 2171, 2003, 3960, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E231
# fmt: on
encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20)
encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20)
self.assertDictEqual(dict(encoding_p), expected_results)
self.assertDictEqual(dict(encoding_r), expected_results)
# CASE 2: not batched
words, boxes = self.get_words_and_boxes()
word_labels = [1, 2, 3]
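# The second word is split into two subwords; with the default
# only_label_first_subword=True, only its first subword keeps label 2, while the
# second one gets the ignore index -100.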
# fmt: off
expected_results = {'input_ids': [101, 1037, 6881, 2135, 3231, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [-100, 1, 2, -100, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} # noqa: E231
# fmt: on
encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
self.assertDictEqual(dict(encoding_p), expected_results)
self.assertDictEqual(dict(encoding_r), expected_results)
# CASE 2: batched
words, boxes = self.get_words_and_boxes_batch()
word_labels = [[1, 2, 3], [2, 46, 17, 22, 3]]
# fmt: off
expected_results = {'input_ids': [[101, 1037, 6881, 2135, 3231, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 7592, 2026, 2171, 2003, 3960, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'labels': [[-100, 1, 2, -100, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], [-100, 2, 46, 17, 22, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E231
# fmt: on
encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
self.assertDictEqual(dict(encoding_p), expected_results)
self.assertDictEqual(dict(encoding_r), expected_results)
# CASE 3: not batched
question, words, boxes = self.get_question_words_and_boxes()
# fmt: off
expected_results = {'input_ids': [101, 2054, 1005, 1055, 2010, 2171, 1029, 102, 1037, 6881, 2135, 3231, 102, 0, 0, 0, 0, 0, 0, 0], 'bbox': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]} # noqa: E231
# fmt: on
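# In the question-answering case, token_type_ids distinguish the question (0) from
# the document words (1); question tokens receive the dummy box [0, 0, 0, 0], and the
# [SEP] tokens receive [1000, 1000, 1000, 1000].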
encoding_p = tokenizer_p(question, words, boxes, padding="max_length", max_length=20)
encoding_r = tokenizer_r(question, words, boxes, padding="max_length", max_length=20)
self.assertDictEqual(dict(encoding_p), expected_results)
self.assertDictEqual(dict(encoding_r), expected_results)
# CASE 3: batched
questions, words, boxes = self.get_question_words_and_boxes_batch()
# fmt: off
expected_results = {'input_ids': [[101, 2054, 1005, 1055, 2010, 2171, 1029, 102, 1037, 6881, 2135, 3231, 102, 0, 0, 0, 0, 0, 0, 0], [101, 2129, 2003, 2002, 2170, 1029, 102, 2054, 1037, 21110, 2546, 3806, 2102, 2078, 102, 0, 0, 0, 0, 0]], 'bbox': [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [336, 42, 353, 57], [34, 42, 66, 69], [34, 42, 66, 69], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]} # noqa: E231
# fmt: on
encoding_p = tokenizer_p(questions, words, boxes, padding="max_length", max_length=20)
encoding_r = tokenizer_r(questions, words, boxes, padding="max_length", max_length=20)
self.assertDictEqual(dict(encoding_p), expected_results)
self.assertDictEqual(dict(encoding_r), expected_results)
@unittest.skip("Doesn't support another framework than PyTorch")
def test_np_encode_plus_sent_to_model(self):
pass