"tests/vscode:/vscode.git/clone" did not exist on "6824461f2a35546a3d781fe60576e00f6db7bedf"
Unverified Commit a98f6a1d authored by anthony2261's avatar anthony2261 Committed by GitHub
Browse files

LayoutXLMProcessor: ensure 1-to-1 mapping between samples and images, and add test for it (#18774)

parent 220da3b8
......@@ -89,6 +89,9 @@ class LayoutXLMProcessor(ProcessorMixin):
"You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
)
if return_overflowing_tokens is True and return_offsets_mapping is False:
raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")
# first, apply the feature extractor
features = self.feature_extractor(images=images, return_tensors=return_tensors)
......
......@@ -126,6 +126,40 @@ class LayoutXLMProcessorTest(unittest.TestCase):
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
@slow
def test_overflowing_tokens(self):
# In the case of overflowing tokens, test that we still have 1-to-1 mapping between the images and input_ids (sequences that are too long are broken down into multiple sequences).
from datasets import load_dataset
# set up
datasets = load_dataset("nielsr/funsd")
processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False)
def preprocess_data(examples):
images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
words = examples["words"]
boxes = examples["bboxes"]
word_labels = examples["ner_tags"]
encoded_inputs = processor(
images,
words,
boxes=boxes,
word_labels=word_labels,
max_length=512,
padding="max_length",
truncation=True,
return_overflowing_tokens=True,
stride=50,
return_offsets_mapping=True,
return_tensors="pt",
)
return encoded_inputs
train_data = preprocess_data(datasets["train"])
self.assertEqual(len(train_data["image"]), len(train_data["input_ids"]))
# different use cases tests
@require_sentencepiece
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment