Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
a98f6a1d
Unverified
Commit
a98f6a1d
authored
Aug 30, 2022
by
anthony2261
Committed by
GitHub
Aug 30, 2022
Browse files
LayoutXLMProcessor: ensure 1-to-1 mapping between samples and images, and add test for it (#18774)
parent
220da3b8
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
37 additions
and
0 deletions
+37
-0
src/transformers/models/layoutxlm/processing_layoutxlm.py
src/transformers/models/layoutxlm/processing_layoutxlm.py
+3
-0
tests/models/layoutxlm/test_processor_layoutxlm.py
tests/models/layoutxlm/test_processor_layoutxlm.py
+34
-0
No files found.
src/transformers/models/layoutxlm/processing_layoutxlm.py
View file @
a98f6a1d
...
@@ -89,6 +89,9 @@ class LayoutXLMProcessor(ProcessorMixin):
...
@@ -89,6 +89,9 @@ class LayoutXLMProcessor(ProcessorMixin):
"You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
"You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
)
)
if
return_overflowing_tokens
is
True
and
return_offsets_mapping
is
False
:
raise
ValueError
(
"You cannot return overflowing tokens without returning the offsets mapping."
)
# first, apply the feature extractor
# first, apply the feature extractor
features
=
self
.
feature_extractor
(
images
=
images
,
return_tensors
=
return_tensors
)
features
=
self
.
feature_extractor
(
images
=
images
,
return_tensors
=
return_tensors
)
...
...
tests/models/layoutxlm/test_processor_layoutxlm.py
View file @
a98f6a1d
...
@@ -126,6 +126,40 @@ class LayoutXLMProcessorTest(unittest.TestCase):
...
@@ -126,6 +126,40 @@ class LayoutXLMProcessorTest(unittest.TestCase):
self
.
assertEqual
(
processor
.
feature_extractor
.
to_json_string
(),
feature_extractor_add_kwargs
.
to_json_string
())
self
.
assertEqual
(
processor
.
feature_extractor
.
to_json_string
(),
feature_extractor_add_kwargs
.
to_json_string
())
self
.
assertIsInstance
(
processor
.
feature_extractor
,
LayoutLMv2FeatureExtractor
)
self
.
assertIsInstance
(
processor
.
feature_extractor
,
LayoutLMv2FeatureExtractor
)
@slow
def test_overflowing_tokens(self):
    """Check that overflowing tokens keep images and input_ids aligned one-to-one.

    With `return_overflowing_tokens=True`, sequences that are too long are
    broken down into multiple sequences; the processor output must then hold
    as many entries under "image" as under "input_ids".
    """
    from datasets import load_dataset

    # set up
    funsd = load_dataset("nielsr/funsd")
    processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)

    def preprocess_data(batch):
        # Load every page as an RGB image before handing the batch to the processor.
        rgb_images = []
        for image_path in batch["image_path"]:
            rgb_images.append(Image.open(image_path).convert("RGB"))

        return processor(
            rgb_images,
            batch["words"],
            boxes=batch["bboxes"],
            word_labels=batch["ner_tags"],
            max_length=512,
            padding="max_length",
            truncation=True,
            return_overflowing_tokens=True,
            stride=50,
            return_offsets_mapping=True,
            return_tensors="pt",
        )

    encoded_train = preprocess_data(funsd["train"])

    self.assertEqual(len(encoded_train["image"]), len(encoded_train["input_ids"]))
# different use cases tests
# different use cases tests
@
require_sentencepiece
@
require_sentencepiece
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment