fix LayoutLMv3TokenizerFast subword label after 'Ġ' token (#21695)

LayoutLMv3TokenizerFast can produce an empty 'Ġ' token with `offset_mapping = (0, 0)`. The next token is then wrongly assumed to also be the beginning of a word, so it is not assigned `pad_token_label` as it should be. Modify the test to use text that produces a 'Ġ' token. Remove the "Copied from LayoutLMv2TokenizerFast" copy check on `_batch_encode_plus`, since the two implementations now differ. Solves issue: #19978

fix LayoutLMv3TokenizerFast subword label after 'Ġ' token (#21695)
LayoutLMv3TokenizerFast can produce an empty 'Ġ' token with `offset_mapping = (0, 0)`. The next token is then wrongly assumed to also be the beginning of a word, so it is not assigned `pad_token_label` as it should be. Modify the test to use text that produces a 'Ġ' token. Remove the "Copied from LayoutLMv2TokenizerFast" copy check on `_batch_encode_plus`, since the two implementations now differ. Solves issue: #19978
4e441e52 · Thibault Douzon · GitHub · a6001056 · 4e441e52 · 4e441e52
Unverified Commit 4e441e52 authored Apr 03, 2023 by Thibault Douzon Committed by GitHub Apr 03, 2023
2 changed files
--- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
+++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
@@ -508,7 +508,6 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
            **kwargs,
        )

-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast._batch_encode_plus with LayoutLMv2->LayoutLMv3
    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
@@ -640,6 +639,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
                else:
                    original_index = batch_index
                labels_example = []
+                previous_token_empty = False
                for id, offset, word_id in zip(
                    sanitized_tokens["input_ids"][batch_index],
                    sanitized_tokens["offset_mapping"][batch_index],
@@ -647,11 +647,15 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
                ):
                    if word_id is not None:
                        if self.only_label_first_subword:
-                            if offset[0] == 0:
+                            if offset[0] == 0 and not previous_token_empty:
                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
                                labels_example.append(word_labels[original_index][word_id])
                            else:
                                labels_example.append(self.pad_token_label)
+                            if offset == (0, 0):
+                                previous_token_empty = True
+                            else:
+                                previous_token_empty = False
                        else:
                            labels_example.append(word_labels[original_index][word_id])
                    else:

--- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
@@ -2277,14 +2277,14 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    @slow
    def test_only_label_first_subword(self):
-        words = ["hello", "niels"]
+        words = ["hello", "niels", "0000000000000000"]
        boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
-        word_labels = [0, 1]
+        word_labels = [0, 1, 2]

        # test slow tokenizer
        tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
        encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
-        self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
+        self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])

        tokenizer_p = LayoutLMv3Tokenizer.from_pretrained(
            "microsoft/layoutlmv3-base",
@@ -2292,12 +2292,12 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
            add_visual_labels=False,
        )
        encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
-        self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
+        self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])

        # test fast tokenizer
        tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
        encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
-        self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
+        self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])

        tokenizer_r = LayoutLMv3Tokenizer.from_pretrained(
            "microsoft/layoutlmv3-base",
@@ -2305,7 +2305,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
            add_visual_labels=False,
        )
        encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
-        self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
+        self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])

    @slow
    def test_layoutlmv3_integration_test(self):