"docs/source/vscode:/vscode.git/clone" did not exist on "fc1ba6fd116ca3d3fdc0c856887be68b859f1018"
Unverified commit 4e441e52, authored by Thibault Douzon, committed by GitHub

fix LayoutLMv3TokenizerFast subword label after 'Ġ' token (#21695)

LayoutLMv3TokenizerFast produces an empty 'Ġ' token with `offset_mapping = (0, 0)`.
The next token is then wrongly assumed to also be the beginning of a word and is not
correctly assigned `pad_token_label`.
Modify the test with text that produces a 'Ġ' token.
Remove the "Copied from LayoutLMv2TokenizerFast" consistency check from `_batch_encode_plus`.

Solves issue #19978.
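
For reference, a minimal sketch to observe the empty 'Ġ' token (the checkpoint and input words are taken from the updated test below; the exact token split is best confirmed by running it):

```python
from transformers import LayoutLMv3TokenizerFast

tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")

# The long numeric word is what triggers the bug in the test below.
words = ["hello", "niels", "0000000000000000"]
boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]

encoding = tokenizer(words, boxes=boxes, return_offsets_mapping=True)
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))
print(encoding["offset_mapping"])
# The numeric word starts with a standalone 'Ġ' token whose offset_mapping is
# (0, 0). Before this fix, the subword following it still satisfied
# `offset[0] == 0`, so it was treated as the start of a new word and received
# the real label instead of `pad_token_label`.
```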
```diff
@@ -508,7 +508,6 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
             **kwargs,
         )
 
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast._batch_encode_plus with LayoutLMv2->LayoutLMv3
     def _batch_encode_plus(
         self,
         batch_text_or_text_pairs: Union[
@@ -640,6 +639,7 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
                 else:
                     original_index = batch_index
                 labels_example = []
+                previous_token_empty = False
                 for id, offset, word_id in zip(
                     sanitized_tokens["input_ids"][batch_index],
                     sanitized_tokens["offset_mapping"][batch_index],
@@ -647,11 +647,15 @@ class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
                 ):
                     if word_id is not None:
                         if self.only_label_first_subword:
-                            if offset[0] == 0:
+                            if offset[0] == 0 and not previous_token_empty:
                                 # Use the real label id for the first token of the word, and padding ids for the remaining tokens
                                 labels_example.append(word_labels[original_index][word_id])
                             else:
                                 labels_example.append(self.pad_token_label)
+                            if offset == (0, 0):
+                                previous_token_empty = True
+                            else:
+                                previous_token_empty = False
                         else:
                             labels_example.append(word_labels[original_index][word_id])
                     else:
```
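
In isolation, the rule the new `previous_token_empty` flag enforces is: a token gets the real word label only when it starts at character offset 0 *and* the preceding token was not an empty `(0, 0)` token. A standalone sketch of that loop, using hypothetical `(token, offset, word_id)` triples shaped like the fast tokenizer's output for the test words below (the diff's four-line `if`/`else` is condensed into a single assignment here):

```python
pad_token_label = -100   # label assigned to non-first subwords
word_labels = [0, 1, 2]  # one label per input word

# Hypothetical tokenization of ["hello", "niels", "0000000000000000"];
# offsets are relative to each word, not to the whole sequence.
tokens = [
    ("Ġhello", (0, 5), 0),
    ("Ġn", (0, 1), 1),
    ("iels", (1, 5), 1),
    ("Ġ", (0, 0), 2),                  # empty token: offset_mapping == (0, 0)
    ("0000000000000000", (0, 16), 2),  # starts at 0 but is not a new word
]

labels = []
previous_token_empty = False
for token, offset, word_id in tokens:
    if offset[0] == 0 and not previous_token_empty:
        labels.append(word_labels[word_id])  # first token of a word
    else:
        labels.append(pad_token_label)       # remaining subwords
    previous_token_empty = offset == (0, 0)

print(labels)  # [0, 1, -100, 2, -100]; without the flag the last entry would be 2
```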
```diff
@@ -2277,14 +2277,14 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
     @slow
     def test_only_label_first_subword(self):
-        words = ["hello", "niels"]
+        words = ["hello", "niels", "0000000000000000"]
         boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
-        word_labels = [0, 1]
+        word_labels = [0, 1, 2]
 
         # test slow tokenizer
         tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
         encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
-        self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
+        self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])
 
         tokenizer_p = LayoutLMv3Tokenizer.from_pretrained(
             "microsoft/layoutlmv3-base",
@@ -2292,12 +2292,12 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             add_visual_labels=False,
         )
         encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
-        self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
+        self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])
 
         # test fast tokenizer
         tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False)
         encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
-        self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
+        self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])
 
         tokenizer_r = LayoutLMv3Tokenizer.from_pretrained(
             "microsoft/layoutlmv3-base",
@@ -2305,7 +2305,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             add_visual_labels=False,
         )
         encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
-        self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
+        self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])
 
     @slow
     def test_layoutlmv3_integration_test(self):
```