Unverified Commit 2ef77421 authored by Ankur Goyal's avatar Ankur Goyal Committed by GitHub
Browse files

Add DocumentQuestionAnswering pipeline (#18414)



* [WIP] Skeleton of VisualQuestionAnsweringPipeline extended to support LayoutLM-like models

* Fixup

* Use the full encoding

* Basic refactoring to DocumentQuestionAnsweringPipeline

* Cleanup

* Improve args, docs, and implement preprocessing

* Integrate OCR

* Refactor question_answering pipeline

* Use refactored QA code in the document qa pipeline

* Fix tests

* Some small cleanups

* Use a string type annotation for Image.Image

* Update encoding with image features

* Wire through the basic docs

* Handle invalid response

* Handle empty word_boxes properly

* Docstring fix

* Integrate Donut model

* Fixup

* Incorporate comments

* Address comments

* Initial incorporation of tests

* Address Comments

* Change assert to ValueError

* Comments

* Wrap `score` in float to make it JSON serializable

* Incorporate AutoModelForDocumentQuestionAnswering changes

* Fixup

* Rename postprocess function

* Fix auto import

* Applying comments

* Improve docs

* Remove extra assets and add copyright

* Address comments
Co-authored-by: default avatarAnkur Goyal <ankur@impira.com>
parent 3059d80d
...@@ -25,6 +25,7 @@ There are two categories of pipeline abstractions to be aware about: ...@@ -25,6 +25,7 @@ There are two categories of pipeline abstractions to be aware about:
- [`AudioClassificationPipeline`] - [`AudioClassificationPipeline`]
- [`AutomaticSpeechRecognitionPipeline`] - [`AutomaticSpeechRecognitionPipeline`]
- [`ConversationalPipeline`] - [`ConversationalPipeline`]
- [`DocumentQuestionAnsweringPipeline`]
- [`FeatureExtractionPipeline`] - [`FeatureExtractionPipeline`]
- [`FillMaskPipeline`] - [`FillMaskPipeline`]
- [`ImageClassificationPipeline`] - [`ImageClassificationPipeline`]
...@@ -342,6 +343,12 @@ That should enable you to do all the custom code you want. ...@@ -342,6 +343,12 @@ That should enable you to do all the custom code you want.
- __call__ - __call__
- all - all
### DocumentQuestionAnsweringPipeline
[[autodoc]] DocumentQuestionAnsweringPipeline
- __call__
- all
### FeatureExtractionPipeline ### FeatureExtractionPipeline
[[autodoc]] FeatureExtractionPipeline [[autodoc]] FeatureExtractionPipeline
......
...@@ -114,6 +114,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its ...@@ -114,6 +114,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its
[[autodoc]] AutoModelForTableQuestionAnswering [[autodoc]] AutoModelForTableQuestionAnswering
## AutoModelForDocumentQuestionAnswering
[[autodoc]] AutoModelForDocumentQuestionAnswering
## AutoModelForImageClassification ## AutoModelForImageClassification
[[autodoc]] AutoModelForImageClassification [[autodoc]] AutoModelForImageClassification
...@@ -214,6 +218,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its ...@@ -214,6 +218,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its
[[autodoc]] TFAutoModelForTableQuestionAnswering [[autodoc]] TFAutoModelForTableQuestionAnswering
## TFAutoModelForDocumentQuestionAnswering
[[autodoc]] TFAutoModelForDocumentQuestionAnswering
## TFAutoModelForTokenClassification ## TFAutoModelForTokenClassification
[[autodoc]] TFAutoModelForTokenClassification [[autodoc]] TFAutoModelForTokenClassification
......
...@@ -383,6 +383,7 @@ _import_structure = { ...@@ -383,6 +383,7 @@ _import_structure = {
"Conversation", "Conversation",
"ConversationalPipeline", "ConversationalPipeline",
"CsvPipelineDataFormat", "CsvPipelineDataFormat",
"DocumentQuestionAnsweringPipeline",
"FeatureExtractionPipeline", "FeatureExtractionPipeline",
"FillMaskPipeline", "FillMaskPipeline",
"ImageClassificationPipeline", "ImageClassificationPipeline",
...@@ -789,6 +790,7 @@ else: ...@@ -789,6 +790,7 @@ else:
"MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING", "MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING",
"MODEL_FOR_CAUSAL_LM_MAPPING", "MODEL_FOR_CAUSAL_LM_MAPPING",
"MODEL_FOR_CTC_MAPPING", "MODEL_FOR_CTC_MAPPING",
"MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING", "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
"MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING", "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
...@@ -816,6 +818,7 @@ else: ...@@ -816,6 +818,7 @@ else:
"AutoModelForAudioXVector", "AutoModelForAudioXVector",
"AutoModelForCausalLM", "AutoModelForCausalLM",
"AutoModelForCTC", "AutoModelForCTC",
"AutoModelForDocumentQuestionAnswering",
"AutoModelForImageClassification", "AutoModelForImageClassification",
"AutoModelForImageSegmentation", "AutoModelForImageSegmentation",
"AutoModelForInstanceSegmentation", "AutoModelForInstanceSegmentation",
...@@ -2107,6 +2110,7 @@ else: ...@@ -2107,6 +2110,7 @@ else:
"TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING", "TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
"TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING", "TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
"TF_MODEL_FOR_PRETRAINING_MAPPING", "TF_MODEL_FOR_PRETRAINING_MAPPING",
"TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING", "TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING", "TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
"TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", "TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
...@@ -2124,6 +2128,7 @@ else: ...@@ -2124,6 +2128,7 @@ else:
"TFAutoModelForMultipleChoice", "TFAutoModelForMultipleChoice",
"TFAutoModelForNextSentencePrediction", "TFAutoModelForNextSentencePrediction",
"TFAutoModelForPreTraining", "TFAutoModelForPreTraining",
"TFAutoModelForDocumentQuestionAnswering",
"TFAutoModelForQuestionAnswering", "TFAutoModelForQuestionAnswering",
"TFAutoModelForSemanticSegmentation", "TFAutoModelForSemanticSegmentation",
"TFAutoModelForSeq2SeqLM", "TFAutoModelForSeq2SeqLM",
...@@ -3200,6 +3205,7 @@ if TYPE_CHECKING: ...@@ -3200,6 +3205,7 @@ if TYPE_CHECKING:
Conversation, Conversation,
ConversationalPipeline, ConversationalPipeline,
CsvPipelineDataFormat, CsvPipelineDataFormat,
DocumentQuestionAnsweringPipeline,
FeatureExtractionPipeline, FeatureExtractionPipeline,
FillMaskPipeline, FillMaskPipeline,
ImageClassificationPipeline, ImageClassificationPipeline,
...@@ -3549,6 +3555,7 @@ if TYPE_CHECKING: ...@@ -3549,6 +3555,7 @@ if TYPE_CHECKING:
MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING,
MODEL_FOR_CTC_MAPPING, MODEL_FOR_CTC_MAPPING,
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
...@@ -3576,6 +3583,7 @@ if TYPE_CHECKING: ...@@ -3576,6 +3583,7 @@ if TYPE_CHECKING:
AutoModelForAudioXVector, AutoModelForAudioXVector,
AutoModelForCausalLM, AutoModelForCausalLM,
AutoModelForCTC, AutoModelForCTC,
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification, AutoModelForImageClassification,
AutoModelForImageSegmentation, AutoModelForImageSegmentation,
AutoModelForInstanceSegmentation, AutoModelForInstanceSegmentation,
...@@ -4637,6 +4645,7 @@ if TYPE_CHECKING: ...@@ -4637,6 +4645,7 @@ if TYPE_CHECKING:
) )
from .models.auto import ( from .models.auto import (
TF_MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
TF_MODEL_FOR_MASKED_LM_MAPPING, TF_MODEL_FOR_MASKED_LM_MAPPING,
...@@ -4655,6 +4664,7 @@ if TYPE_CHECKING: ...@@ -4655,6 +4664,7 @@ if TYPE_CHECKING:
TF_MODEL_WITH_LM_HEAD_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING,
TFAutoModel, TFAutoModel,
TFAutoModelForCausalLM, TFAutoModelForCausalLM,
TFAutoModelForDocumentQuestionAnswering,
TFAutoModelForImageClassification, TFAutoModelForImageClassification,
TFAutoModelForMaskedLM, TFAutoModelForMaskedLM,
TFAutoModelForMultipleChoice, TFAutoModelForMultipleChoice,
......
...@@ -47,6 +47,7 @@ else: ...@@ -47,6 +47,7 @@ else:
"MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING", "MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING",
"MODEL_FOR_CAUSAL_LM_MAPPING", "MODEL_FOR_CAUSAL_LM_MAPPING",
"MODEL_FOR_CTC_MAPPING", "MODEL_FOR_CTC_MAPPING",
"MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING", "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
"MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING", "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
...@@ -93,6 +94,7 @@ else: ...@@ -93,6 +94,7 @@ else:
"AutoModelForVideoClassification", "AutoModelForVideoClassification",
"AutoModelForVision2Seq", "AutoModelForVision2Seq",
"AutoModelForVisualQuestionAnswering", "AutoModelForVisualQuestionAnswering",
"AutoModelForDocumentQuestionAnswering",
"AutoModelWithLMHead", "AutoModelWithLMHead",
] ]
...@@ -111,6 +113,7 @@ else: ...@@ -111,6 +113,7 @@ else:
"TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING", "TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
"TF_MODEL_FOR_PRETRAINING_MAPPING", "TF_MODEL_FOR_PRETRAINING_MAPPING",
"TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING", "TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING", "TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
"TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", "TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
"TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING", "TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
...@@ -127,6 +130,7 @@ else: ...@@ -127,6 +130,7 @@ else:
"TFAutoModelForMultipleChoice", "TFAutoModelForMultipleChoice",
"TFAutoModelForNextSentencePrediction", "TFAutoModelForNextSentencePrediction",
"TFAutoModelForPreTraining", "TFAutoModelForPreTraining",
"TFAutoModelForDocumentQuestionAnswering",
"TFAutoModelForQuestionAnswering", "TFAutoModelForQuestionAnswering",
"TFAutoModelForSemanticSegmentation", "TFAutoModelForSemanticSegmentation",
"TFAutoModelForSeq2SeqLM", "TFAutoModelForSeq2SeqLM",
...@@ -191,6 +195,7 @@ if TYPE_CHECKING: ...@@ -191,6 +195,7 @@ if TYPE_CHECKING:
MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING,
MODEL_FOR_CTC_MAPPING, MODEL_FOR_CTC_MAPPING,
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
...@@ -218,6 +223,7 @@ if TYPE_CHECKING: ...@@ -218,6 +223,7 @@ if TYPE_CHECKING:
AutoModelForAudioXVector, AutoModelForAudioXVector,
AutoModelForCausalLM, AutoModelForCausalLM,
AutoModelForCTC, AutoModelForCTC,
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification, AutoModelForImageClassification,
AutoModelForImageSegmentation, AutoModelForImageSegmentation,
AutoModelForInstanceSegmentation, AutoModelForInstanceSegmentation,
...@@ -248,6 +254,7 @@ if TYPE_CHECKING: ...@@ -248,6 +254,7 @@ if TYPE_CHECKING:
else: else:
from .modeling_tf_auto import ( from .modeling_tf_auto import (
TF_MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
TF_MODEL_FOR_MASKED_LM_MAPPING, TF_MODEL_FOR_MASKED_LM_MAPPING,
...@@ -266,6 +273,7 @@ if TYPE_CHECKING: ...@@ -266,6 +273,7 @@ if TYPE_CHECKING:
TF_MODEL_WITH_LM_HEAD_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING,
TFAutoModel, TFAutoModel,
TFAutoModelForCausalLM, TFAutoModelForCausalLM,
TFAutoModelForDocumentQuestionAnswering,
TFAutoModelForImageClassification, TFAutoModelForImageClassification,
TFAutoModelForMaskedLM, TFAutoModelForMaskedLM,
TFAutoModelForMultipleChoice, TFAutoModelForMultipleChoice,
......
...@@ -603,6 +603,14 @@ MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( ...@@ -603,6 +603,14 @@ MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
] ]
) )
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
    [
        # Model for Document Question Answering mapping
        # LayoutLM-family models reuse their extractive QA heads for document QA.
        ("layoutlm", "LayoutLMForQuestionAnswering"),
        ("layoutlmv2", "LayoutLMv2ForQuestionAnswering"),
        ("layoutlmv3", "LayoutLMv3ForQuestionAnswering"),
    ]
)
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[ [
# Model for Token Classification mapping # Model for Token Classification mapping
...@@ -773,6 +781,9 @@ MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FO ...@@ -773,6 +781,9 @@ MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FO
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
) )
# Lazily resolves config classes to the document-QA head classes named above,
# deferring model imports until a model is actually requested.
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
    CONFIG_MAPPING_NAMES, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES
)
MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES) MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES)
MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping( MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES
...@@ -891,6 +902,17 @@ AutoModelForVisualQuestionAnswering = auto_class_update( ...@@ -891,6 +902,17 @@ AutoModelForVisualQuestionAnswering = auto_class_update(
) )
class AutoModelForDocumentQuestionAnswering(_BaseAutoModelClass):
    # Auto class: `from_pretrained`/`from_config` dispatch through the
    # document-question-answering head mapping.
    _model_mapping = MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING


AutoModelForDocumentQuestionAnswering = auto_class_update(
    AutoModelForDocumentQuestionAnswering,
    head_doc="document question answering",
    # NOTE(review): the embedded quote looks like a bug but appears deliberate — it
    # splices a `revision="3dc6de3"` kwarg into the generated docstring example.
    # Confirm against the `auto_class_update` docstring template before changing.
    checkpoint_for_example='impira/layoutlm-document-qa", revision="3dc6de3',
)
class AutoModelForTokenClassification(_BaseAutoModelClass): class AutoModelForTokenClassification(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING _model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
......
...@@ -315,6 +315,13 @@ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( ...@@ -315,6 +315,13 @@ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
] ]
) )
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
    [
        # Model for Document Question Answering mapping (TensorFlow)
        # Only LayoutLM has a TF question-answering head in this mapping.
        ("layoutlm", "TFLayoutLMForQuestionAnswering"),
    ]
)
TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
[ [
# Model for Table Question Answering mapping # Model for Table Question Answering mapping
...@@ -406,6 +413,9 @@ TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping( ...@@ -406,6 +413,9 @@ TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping(
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES CONFIG_MAPPING_NAMES, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
) )
# Lazy config-to-model resolution for the TF document-QA heads named above.
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES
)
TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES
) )
...@@ -515,6 +525,17 @@ class TFAutoModelForQuestionAnswering(_BaseAutoModelClass): ...@@ -515,6 +525,17 @@ class TFAutoModelForQuestionAnswering(_BaseAutoModelClass):
TFAutoModelForQuestionAnswering = auto_class_update(TFAutoModelForQuestionAnswering, head_doc="question answering") TFAutoModelForQuestionAnswering = auto_class_update(TFAutoModelForQuestionAnswering, head_doc="question answering")
class TFAutoModelForDocumentQuestionAnswering(_BaseAutoModelClass):
    # TensorFlow auto class for document question answering heads.
    _model_mapping = TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING


TFAutoModelForDocumentQuestionAnswering = auto_class_update(
    TFAutoModelForDocumentQuestionAnswering,
    head_doc="document question answering",
    # NOTE(review): the embedded quote deliberately injects a `revision="3dc6de3"`
    # kwarg into the generated doc example — confirm against `auto_class_update`.
    checkpoint_for_example='impira/layoutlm-document-qa", revision="3dc6de3',
)
class TFAutoModelForTableQuestionAnswering(_BaseAutoModelClass): class TFAutoModelForTableQuestionAnswering(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING _model_mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
......
...@@ -51,6 +51,7 @@ from .base import ( ...@@ -51,6 +51,7 @@ from .base import (
infer_framework_load_model, infer_framework_load_model,
) )
from .conversational import Conversation, ConversationalPipeline from .conversational import Conversation, ConversationalPipeline
from .document_question_answering import DocumentQuestionAnsweringPipeline
from .feature_extraction import FeatureExtractionPipeline from .feature_extraction import FeatureExtractionPipeline
from .fill_mask import FillMaskPipeline from .fill_mask import FillMaskPipeline
from .image_classification import ImageClassificationPipeline from .image_classification import ImageClassificationPipeline
...@@ -109,6 +110,7 @@ if is_torch_available(): ...@@ -109,6 +110,7 @@ if is_torch_available():
AutoModelForAudioClassification, AutoModelForAudioClassification,
AutoModelForCausalLM, AutoModelForCausalLM,
AutoModelForCTC, AutoModelForCTC,
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification, AutoModelForImageClassification,
AutoModelForImageSegmentation, AutoModelForImageSegmentation,
AutoModelForMaskedLM, AutoModelForMaskedLM,
...@@ -215,6 +217,15 @@ SUPPORTED_TASKS = { ...@@ -215,6 +217,15 @@ SUPPORTED_TASKS = {
}, },
"type": "multimodal", "type": "multimodal",
}, },
"document-question-answering": {
"impl": DocumentQuestionAnsweringPipeline,
"pt": (AutoModelForDocumentQuestionAnswering,) if is_torch_available() else (),
"tf": (),
"default": {
"model": {"pt": ("impira/layoutlm-document-qa", "3a93017")},
},
"type": "multimodal",
},
"fill-mask": { "fill-mask": {
"impl": FillMaskPipeline, "impl": FillMaskPipeline,
"tf": (TFAutoModelForMaskedLM,) if is_tf_available() else (), "tf": (TFAutoModelForMaskedLM,) if is_tf_available() else (),
...@@ -443,7 +454,7 @@ def pipeline( ...@@ -443,7 +454,7 @@ def pipeline(
trust_remote_code: Optional[bool] = None, trust_remote_code: Optional[bool] = None,
model_kwargs: Dict[str, Any] = None, model_kwargs: Dict[str, Any] = None,
pipeline_class: Optional[Any] = None, pipeline_class: Optional[Any] = None,
**kwargs **kwargs,
) -> Pipeline: ) -> Pipeline:
""" """
Utility factory method to build a [`Pipeline`]. Utility factory method to build a [`Pipeline`].
......
...@@ -178,7 +178,7 @@ def infer_framework_load_model( ...@@ -178,7 +178,7 @@ def infer_framework_load_model(
model_classes: Optional[Dict[str, Tuple[type]]] = None, model_classes: Optional[Dict[str, Tuple[type]]] = None,
task: Optional[str] = None, task: Optional[str] = None,
framework: Optional[str] = None, framework: Optional[str] = None,
**model_kwargs **model_kwargs,
): ):
""" """
Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model). Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).
...@@ -274,7 +274,7 @@ def infer_framework_from_model( ...@@ -274,7 +274,7 @@ def infer_framework_from_model(
model_classes: Optional[Dict[str, Tuple[type]]] = None, model_classes: Optional[Dict[str, Tuple[type]]] = None,
task: Optional[str] = None, task: Optional[str] = None,
framework: Optional[str] = None, framework: Optional[str] = None,
**model_kwargs **model_kwargs,
): ):
""" """
Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model). Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).
......
This diff is collapsed.
...@@ -42,6 +42,110 @@ if is_torch_available(): ...@@ -42,6 +42,110 @@ if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING from ..models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING
def decode_spans(
    start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int, undesired_tokens: np.ndarray
) -> Tuple:
    """
    Turn per-token start/end probabilities from a `ModelForQuestionAnswering` into the k best answer spans.

    Spans that are impossible (end before start) or too long (length > `max_answer_len`), and spans whose
    endpoints fall on tokens masked out by `undesired_tokens`, are discarded. The `topk` argument controls
    how many candidate spans are returned.

    Args:
        start (`np.ndarray`): Individual start probabilities for each token.
        end (`np.ndarray`): Individual end probabilities for each token.
        topk (`int`): Indicates how many possible answer span(s) to extract from the model output.
        max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
        undesired_tokens (`np.ndarray`): Mask determining tokens that can be part of the answer
    """
    # Promote 1-D inputs to a batch of size one so the pairwise product below is well-defined.
    if start.ndim == 1:
        start = start[None]
    if end.ndim == 1:
        end = end[None]

    # Score every (start, end) pair as p(start) * p(end) via an outer product.
    pair_scores = np.expand_dims(start, -1) @ np.expand_dims(end, 1)

    # Zero out pairs with end < start (triu) and spans longer than max_answer_len (tril).
    pair_scores = np.tril(np.triu(pair_scores), max_answer_len - 1)

    # Pick the topk highest-scoring flat indices (inspired by Chen & al., facebookresearch/DrQA).
    flat = pair_scores.flatten()
    if topk == 1:
        best = [np.argmax(flat)]
    elif len(flat) < topk:
        best = np.argsort(-flat)
    else:
        partial = np.argpartition(-flat, topk)[0:topk]
        best = partial[np.argsort(-flat[partial])]

    starts, ends = np.unravel_index(best, pair_scores.shape)[1:]

    # Keep only spans whose endpoints are on tokens allowed by the mask.
    allowed = undesired_tokens.nonzero()
    keep = np.isin(starts, allowed) & np.isin(ends, allowed)
    starts = starts[keep]
    ends = ends[keep]
    scores = pair_scores[0, starts, ends]
    return starts, ends, scores
def select_starts_ends(
    start,
    end,
    p_mask,
    attention_mask,
    min_null_score=1000000,
    top_k=1,
    handle_impossible_answer=False,
    max_answer_len=15,
):
    """
    Takes the raw output of any `ModelForQuestionAnswering`, normalizes the start/end logits, and delegates
    to `decode_spans()` to produce probabilities for each candidate answer span.

    Args:
        start (`np.ndarray`): Individual start logits for each token.
        end (`np.ndarray`): Individual end logits for each token.
        p_mask (`np.ndarray`): A mask with 1 for values that cannot be in the answer
        attention_mask (`np.ndarray`): The attention mask generated by the tokenizer
        min_null_score(`float`): The minimum null (empty) answer score seen so far.
        top_k (`int`): Indicates how many possible answer span(s) to extract from the model output.
        handle_impossible_answer(`bool`): Whether to allow null (empty) answers
        max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
    """
    # Candidate tokens are the inverse of p_mask, further restricted to attended positions.
    candidate_mask = np.abs(np.array(p_mask) - 1)
    if attention_mask is not None:
        candidate_mask = candidate_mask & attention_mask

    blocked = candidate_mask == 0.0

    # Drive blocked positions to a large negative logit so they vanish after the softmax.
    start = np.where(blocked, -10000.0, start)
    end = np.where(blocked, -10000.0, end)

    # Softmax over the logits (note: the division normalizes over the whole array,
    # matching the historical behaviour of the QA pipeline).
    start = np.exp(start - start.max(axis=-1, keepdims=True))
    start = start / start.sum()
    end = np.exp(end - end.max(axis=-1, keepdims=True))
    end = end / end.sum()

    if handle_impossible_answer:
        # Track the best "no answer" score, taken from the CLS-to-CLS span.
        min_null_score = min(min_null_score, (start[0, 0] * end[0, 0]).item())

    # Mask CLS so it cannot be selected as part of a real answer span.
    # NOTE(review): indentation was lost in this paste — placement outside the
    # `handle_impossible_answer` branch matches the pre-refactor pipeline code; confirm.
    start[0, 0] = end[0, 0] = 0.0

    starts, ends, scores = decode_spans(start, end, top_k, max_answer_len, candidate_mask)
    return starts, ends, scores, min_null_score
class QuestionAnsweringArgumentHandler(ArgumentHandler): class QuestionAnsweringArgumentHandler(ArgumentHandler):
""" """
QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to
...@@ -141,7 +245,7 @@ class QuestionAnsweringPipeline(ChunkPipeline): ...@@ -141,7 +245,7 @@ class QuestionAnsweringPipeline(ChunkPipeline):
framework: Optional[str] = None, framework: Optional[str] = None,
device: int = -1, device: int = -1,
task: str = "", task: str = "",
**kwargs **kwargs,
): ):
super().__init__( super().__init__(
model=model, model=model,
...@@ -410,34 +514,15 @@ class QuestionAnsweringPipeline(ChunkPipeline): ...@@ -410,34 +514,15 @@ class QuestionAnsweringPipeline(ChunkPipeline):
start_ = output["start"] start_ = output["start"]
end_ = output["end"] end_ = output["end"]
example = output["example"] example = output["example"]
p_mask = output["p_mask"]
attention_mask = (
output["attention_mask"].numpy() if output.get("attention_mask", None) is not None else None
)
# Ensure padded tokens & question tokens cannot belong to the set of candidate answers. starts, ends, scores, min_null_score = select_starts_ends(
undesired_tokens = np.abs(np.array(output["p_mask"]) - 1) start_, end_, p_mask, attention_mask, min_null_score, top_k, handle_impossible_answer, max_answer_len
)
if output.get("attention_mask", None) is not None:
undesired_tokens = undesired_tokens & output["attention_mask"].numpy()
# Generate mask
undesired_tokens_mask = undesired_tokens == 0.0
# Make sure non-context indexes in the tensor cannot contribute to the softmax
start_ = np.where(undesired_tokens_mask, -10000.0, start_)
end_ = np.where(undesired_tokens_mask, -10000.0, end_)
# Normalize logits and spans to retrieve the answer
start_ = np.exp(start_ - start_.max(axis=-1, keepdims=True))
start_ = start_ / start_.sum()
end_ = np.exp(end_ - end_.max(axis=-1, keepdims=True))
end_ = end_ / end_.sum()
if handle_impossible_answer:
min_null_score = min(min_null_score, (start_[0, 0] * end_[0, 0]).item())
# Mask CLS
start_[0, 0] = end_[0, 0] = 0.0
starts, ends, scores = self.decode(start_, end_, top_k, max_answer_len, undesired_tokens)
if not self.tokenizer.is_fast: if not self.tokenizer.is_fast:
char_to_word = np.array(example.char_to_word_offset) char_to_word = np.array(example.char_to_word_offset)
...@@ -518,55 +603,6 @@ class QuestionAnsweringPipeline(ChunkPipeline): ...@@ -518,55 +603,6 @@ class QuestionAnsweringPipeline(ChunkPipeline):
end_index = enc.offsets[e][1] end_index = enc.offsets[e][1]
return start_index, end_index return start_index, end_index
def decode(
    self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int, undesired_tokens: np.ndarray
) -> Tuple:
    """
    Take the output of any `ModelForQuestionAnswering` and will generate probabilities for each span to be the
    actual answer.

    In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or
    answer end position being before the starting position. The method supports output the k-best answer through
    the topk argument.

    NOTE: this legacy method duplicates the module-level `decode_spans` function; the pipeline's
    postprocessing path now calls `select_starts_ends`/`decode_spans` instead.

    Args:
        start (`np.ndarray`): Individual start probabilities for each token.
        end (`np.ndarray`): Individual end probabilities for each token.
        topk (`int`): Indicates how many possible answer span(s) to extract from the model output.
        max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
        undesired_tokens (`np.ndarray`): Mask determining tokens that can be part of the answer
    """
    # Ensure we have batch axis
    if start.ndim == 1:
        start = start[None]
    if end.ndim == 1:
        end = end[None]

    # Compute the score of each tuple(start, end) to be the real answer
    outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

    # Remove candidate with end < start and end - start > max_answer_len
    candidates = np.tril(np.triu(outer), max_answer_len - 1)

    # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
    scores_flat = candidates.flatten()
    if topk == 1:
        idx_sort = [np.argmax(scores_flat)]
    elif len(scores_flat) < topk:
        idx_sort = np.argsort(-scores_flat)
    else:
        # Partial sort: partition to the topk scores, then order only those.
        idx = np.argpartition(-scores_flat, topk)[0:topk]
        idx_sort = idx[np.argsort(-scores_flat[idx])]

    # Convert flat indices back to (start, end) pairs, dropping the batch axis.
    starts, ends = np.unravel_index(idx_sort, candidates.shape)[1:]
    # Keep only spans whose endpoints land on tokens allowed by the mask.
    desired_spans = np.isin(starts, undesired_tokens.nonzero()) & np.isin(ends, undesired_tokens.nonzero())
    starts = starts[desired_spans]
    ends = ends[desired_spans]
    scores = candidates[0, starts, ends]

    return starts, ends, scores
def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]: def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]:
""" """
When decoding from token probabilities, this method maps token indexes to actual word in the initial context. When decoding from token probabilities, this method maps token indexes to actual word in the initial context.
......
...@@ -358,6 +358,9 @@ MODEL_FOR_CAUSAL_LM_MAPPING = None ...@@ -358,6 +358,9 @@ MODEL_FOR_CAUSAL_LM_MAPPING = None
MODEL_FOR_CTC_MAPPING = None MODEL_FOR_CTC_MAPPING = None
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = None
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None
...@@ -463,6 +466,13 @@ class AutoModelForCTC(metaclass=DummyObject): ...@@ -463,6 +466,13 @@ class AutoModelForCTC(metaclass=DummyObject):
requires_backends(self, ["torch"]) requires_backends(self, ["torch"])
class AutoModelForDocumentQuestionAnswering(metaclass=DummyObject):
    # Dummy placeholder: raises an informative ImportError when torch is missing.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
class AutoModelForImageClassification(metaclass=DummyObject): class AutoModelForImageClassification(metaclass=DummyObject):
_backends = ["torch"] _backends = ["torch"]
......
...@@ -265,6 +265,9 @@ class TFAlbertPreTrainedModel(metaclass=DummyObject): ...@@ -265,6 +265,9 @@ class TFAlbertPreTrainedModel(metaclass=DummyObject):
TF_MODEL_FOR_CAUSAL_LM_MAPPING = None TF_MODEL_FOR_CAUSAL_LM_MAPPING = None
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = None
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None
...@@ -327,6 +330,13 @@ class TFAutoModelForCausalLM(metaclass=DummyObject): ...@@ -327,6 +330,13 @@ class TFAutoModelForCausalLM(metaclass=DummyObject):
requires_backends(self, ["tf"]) requires_backends(self, ["tf"])
class TFAutoModelForDocumentQuestionAnswering(metaclass=DummyObject):
    # Dummy placeholder: raises an informative ImportError when TensorFlow is missing.
    _backends = ["tf"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tf"])
class TFAutoModelForImageClassification(metaclass=DummyObject): class TFAutoModelForImageClassification(metaclass=DummyObject):
_backends = ["tf"] _backends = ["tf"]
......
...@@ -36,6 +36,7 @@ from ..models.auto.modeling_auto import ( ...@@ -36,6 +36,7 @@ from ..models.auto.modeling_auto import (
MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
MODEL_FOR_CTC_MAPPING_NAMES, MODEL_FOR_CTC_MAPPING_NAMES,
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES,
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES,
MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES,
MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES,
...@@ -71,6 +72,7 @@ def _generate_supported_model_class_names( ...@@ -71,6 +72,7 @@ def _generate_supported_model_class_names(
"seq2seq-lm": MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, "seq2seq-lm": MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
"speech-seq2seq": MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, "speech-seq2seq": MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES,
"multiple-choice": MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES, "multiple-choice": MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES,
"document-question-answering": MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES,
"question-answering": MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, "question-answering": MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES,
"sequence-classification": MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, "sequence-classification": MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES,
"token-classification": MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, "token-classification": MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
...@@ -147,7 +149,6 @@ _SPECIAL_SUPPORTED_MODELS = [ ...@@ -147,7 +149,6 @@ _SPECIAL_SUPPORTED_MODELS = [
"GPT2DoubleHeadsModel", "GPT2DoubleHeadsModel",
"Speech2Text2Decoder", "Speech2Text2Decoder",
"TrOCRDecoder", "TrOCRDecoder",
"LayoutLMForQuestionAnswering",
# TODO: add support for them as it should be quite easy to do so (small blocking issues). # TODO: add support for them as it should be quite easy to do so (small blocking issues).
# XLNetForQuestionAnswering, # XLNetForQuestionAnswering,
] ]
...@@ -691,7 +692,7 @@ class HFTracer(Tracer): ...@@ -691,7 +692,7 @@ class HFTracer(Tracer):
inputs_dict["labels"] = torch.zeros(batch_size, dtype=torch.long, device=device) inputs_dict["labels"] = torch.zeros(batch_size, dtype=torch.long, device=device)
elif model_class_name in [ elif model_class_name in [
*get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES), *get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES),
"LayoutLMForQuestionAnswering", *get_values(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES),
"XLNetForQuestionAnswering", "XLNetForQuestionAnswering",
]: ]:
inputs_dict["start_positions"] = torch.zeros(batch_size, dtype=torch.long, device=device) inputs_dict["start_positions"] = torch.zeros(batch_size, dtype=torch.long, device=device)
......
...@@ -12,12 +12,9 @@ ...@@ -12,12 +12,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import copy
import unittest import unittest
from transformers import LayoutLMConfig, is_torch_available from transformers import LayoutLMConfig, is_torch_available
from transformers.models.auto import get_values
from transformers.testing_utils import require_torch, slow, torch_device from transformers.testing_utils import require_torch, slow, torch_device
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
...@@ -28,9 +25,6 @@ if is_torch_available(): ...@@ -28,9 +25,6 @@ if is_torch_available():
import torch import torch
from transformers import ( from transformers import (
MODEL_FOR_MASKED_LM_MAPPING,
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
LayoutLMForMaskedLM, LayoutLMForMaskedLM,
LayoutLMForQuestionAnswering, LayoutLMForQuestionAnswering,
LayoutLMForSequenceClassification, LayoutLMForSequenceClassification,
...@@ -273,30 +267,6 @@ class LayoutLMModelTest(ModelTesterMixin, unittest.TestCase): ...@@ -273,30 +267,6 @@ class LayoutLMModelTest(ModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs() config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_question_answering(*config_and_inputs) self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = copy.deepcopy(inputs_dict)
if return_labels:
if model_class in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
inputs_dict["labels"] = torch.zeros(
self.model_tester.batch_size, dtype=torch.long, device=torch_device
)
elif model_class in [
*get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
*get_values(MODEL_FOR_MASKED_LM_MAPPING),
]:
inputs_dict["labels"] = torch.zeros(
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
)
elif model_class.__name__ == "LayoutLMForQuestionAnswering":
inputs_dict["start_positions"] = torch.zeros(
self.model_tester.batch_size, dtype=torch.long, device=torch_device
)
inputs_dict["end_positions"] = torch.zeros(
self.model_tester.batch_size, dtype=torch.long, device=torch_device
)
return inputs_dict
def prepare_layoutlm_batch_inputs(): def prepare_layoutlm_batch_inputs():
# Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on: # Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on:
......
...@@ -13,13 +13,11 @@ ...@@ -13,13 +13,11 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import copy
import unittest import unittest
import numpy as np import numpy as np
from transformers import LayoutLMConfig, is_tf_available from transformers import LayoutLMConfig, is_tf_available
from transformers.models.auto import get_values
from transformers.testing_utils import require_tf, slow from transformers.testing_utils import require_tf, slow
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
...@@ -29,11 +27,6 @@ from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_at ...@@ -29,11 +27,6 @@ from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_at
if is_tf_available(): if is_tf_available():
import tensorflow as tf import tensorflow as tf
from transformers import (
TF_MODEL_FOR_MASKED_LM_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
)
from transformers.models.layoutlm.modeling_tf_layoutlm import ( from transformers.models.layoutlm.modeling_tf_layoutlm import (
TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST, TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
TFLayoutLMForMaskedLM, TFLayoutLMForMaskedLM,
...@@ -263,24 +256,6 @@ class TFLayoutLMModelTest(TFModelTesterMixin, unittest.TestCase): ...@@ -263,24 +256,6 @@ class TFLayoutLMModelTest(TFModelTesterMixin, unittest.TestCase):
model = TFLayoutLMModel.from_pretrained(model_name) model = TFLayoutLMModel.from_pretrained(model_name)
self.assertIsNotNone(model) self.assertIsNotNone(model)
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = copy.deepcopy(inputs_dict)
if return_labels:
if model_class in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
elif model_class in [
*get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
*get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
]:
inputs_dict["labels"] = tf.zeros(
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
)
elif model_class.__name__ == "TFLayoutLMForQuestionAnswering":
inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
return inputs_dict
def prepare_layoutlm_batch_inputs(): def prepare_layoutlm_batch_inputs():
# Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on: # Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on:
......
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, AutoTokenizer, is_vision_available
from transformers.pipelines import pipeline
from transformers.pipelines.document_question_answering import apply_tesseract
from transformers.testing_utils import (
is_pipeline_test,
nested_simplify,
require_detectron2,
require_pytesseract,
require_tf,
require_torch,
require_vision,
slow,
)
from .test_pipelines_common import ANY, PipelineTestCaseMeta
if not is_vision_available():
    # Vision extras are missing: provide inert stand-ins so this module can
    # still be imported (the pipeline tests themselves are gated by
    # @require_vision and will be skipped).

    class Image:
        """Placeholder mirroring the tiny subset of ``PIL.Image`` used here."""

        @staticmethod
        def open(*args, **kwargs):
            pass

    def load_image(_):
        """Stub for ``transformers.image_utils.load_image``; always returns None."""
        return None

else:
    from PIL import Image
    from transformers.image_utils import load_image
# An invoice image pinned to a specific revision of a document question answering space hosted on
# the HuggingFace Hub, so we can rely on it remaining available and unchanged.
INVOICE_URL = (
    "https://huggingface.co/spaces/impira/docquery/resolve/2f6c96314dc84dfda62d40de9da55f2f5165d403/invoice.png"
)
@is_pipeline_test
@require_torch
@require_vision
class DocumentQuestionAnsweringPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
    """Tests for the `document-question-answering` pipeline.

    Covers the generic per-model-mapping harness (via ``PipelineTestCaseMeta``)
    plus targeted tests for LayoutLMv2, LayoutLM, and Donut checkpoints. The
    large-model tests pin exact scores/answers for specific model revisions.
    """

    # Architectures the generic pipeline test harness instantiates tiny models from.
    model_mapping = MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING

    @require_pytesseract
    @require_vision
    def get_test_pipeline(self, model, tokenizer, feature_extractor):
        """Build a pipeline plus example inputs exercising every accepted input shape:
        a pre-loaded image, an image URL, image + precomputed word_boxes, and
        word_boxes alone (image=None)."""
        dqa_pipeline = pipeline(
            "document-question-answering", model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
        )
        image = INVOICE_URL
        # Run OCR once up front so the word_boxes variants don't depend on the
        # pipeline's internal tesseract call.
        word_boxes = list(zip(*apply_tesseract(load_image(image), None, "")))
        question = "What is the placebo?"
        examples = [
            {
                "image": load_image(image),
                "question": question,
            },
            {
                "image": image,
                "question": question,
            },
            {
                "image": image,
                "question": question,
                "word_boxes": word_boxes,
            },
            {
                "image": None,
                "question": question,
                "word_boxes": word_boxes,
            },
        ]
        return dqa_pipeline, examples

    def run_pipeline_test(self, dqa_pipeline, examples):
        """All four input shapes must yield top_k answers of the same schema."""
        outputs = dqa_pipeline(examples, top_k=2)
        self.assertEqual(
            outputs,
            [
                [
                    {"score": ANY(float), "answer": ANY(str), "start": ANY(int), "end": ANY(int)},
                    {"score": ANY(float), "answer": ANY(str), "start": ANY(int), "end": ANY(int)},
                ]
            ]
            * 4,
        )

    @require_torch
    @require_detectron2
    @require_pytesseract
    def test_small_model_pt(self):
        """Tiny LayoutLMv2: checks output schema/values and the empty-OCR edge cases."""
        dqa_pipeline = pipeline("document-question-answering", model="hf-internal-testing/tiny-random-layoutlmv2")
        image = INVOICE_URL
        question = "How many cats are there?"
        # Expected values come from the pinned tiny-random checkpoint; the
        # "answer" text is nonsense by design (random weights).
        expected_output = [
            {
                "score": 0.0001,
                "answer": "2312/2019 DUE DATE 26102/2019 ay DESCRIPTION UNIT PRICE",
                "start": 38,
                "end": 45,
            },
            {"score": 0.0001, "answer": "2312/2019 DUE", "start": 38, "end": 39},
        ]
        outputs = dqa_pipeline(image=image, question=question, top_k=2)
        self.assertEqual(nested_simplify(outputs, decimals=4), expected_output)
        # Same call via the dict form must be equivalent.
        outputs = dqa_pipeline({"image": image, "question": question}, top_k=2)
        self.assertEqual(nested_simplify(outputs, decimals=4), expected_output)
        # This image does not detect ANY text in it, meaning layoutlmv2 should fail.
        # Empty answer probably
        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        outputs = dqa_pipeline(image=image, question=question, top_k=2)
        self.assertEqual(outputs, [])
        # We can optionally pass directly the words and bounding boxes
        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        words = []
        boxes = []
        outputs = dqa_pipeline(image=image, question=question, words=words, boxes=boxes, top_k=2)
        self.assertEqual(outputs, [])

    # TODO: Enable this once hf-internal-testing/tiny-random-donut is implemented
    # @require_torch
    # def test_small_model_pt_donut(self):
    #     dqa_pipeline = pipeline("document-question-answering", model="hf-internal-testing/tiny-random-donut")
    #     # dqa_pipeline = pipeline("document-question-answering", model="../tiny-random-donut")
    #     image = "https://templates.invoicehome.com/invoice-template-us-neat-750px.png"
    #     question = "How many cats are there?"
    #
    #     outputs = dqa_pipeline(image=image, question=question, top_k=2)
    #     self.assertEqual(
    #         nested_simplify(outputs, decimals=4), [{"score": 0.8799, "answer": "2"}, {"score": 0.296, "answer": "1"}]
    #     )

    @slow
    @require_torch
    @require_detectron2
    @require_pytesseract
    def test_large_model_pt(self):
        """Full LayoutLMv2 DocVQA checkpoint (pinned revision): kwargs, dict, and batched inputs."""
        dqa_pipeline = pipeline(
            "document-question-answering",
            model="tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa",
            revision="9977165",
        )
        image = INVOICE_URL
        question = "What is the invoice number?"
        outputs = dqa_pipeline(image=image, question=question, top_k=2)
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                {"score": 0.9966, "answer": "us-001", "start": 15, "end": 15},
                {"score": 0.0009, "answer": "us-001", "start": 15, "end": 15},
            ],
        )
        outputs = dqa_pipeline({"image": image, "question": question}, top_k=2)
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                {"score": 0.9966, "answer": "us-001", "start": 15, "end": 15},
                {"score": 0.0009, "answer": "us-001", "start": 15, "end": 15},
            ],
        )
        # A list of examples returns one result list per example.
        outputs = dqa_pipeline(
            [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2
        )
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                [
                    {"score": 0.9966, "answer": "us-001", "start": 15, "end": 15},
                    {"score": 0.0009, "answer": "us-001", "start": 15, "end": 15},
                ],
            ]
            * 2,
        )

    @slow
    @require_torch
    @require_pytesseract
    @require_vision
    def test_large_model_pt_layoutlm(self):
        """LayoutLM (v1) DocVQA checkpoint; also checks the image=None + word_boxes path."""
        tokenizer = AutoTokenizer.from_pretrained(
            "impira/layoutlm-document-qa", revision="3dc6de3", add_prefix_space=True
        )
        dqa_pipeline = pipeline(
            "document-question-answering",
            model="impira/layoutlm-document-qa",
            tokenizer=tokenizer,
            revision="3dc6de3",
        )
        image = INVOICE_URL
        question = "What is the invoice number?"
        outputs = dqa_pipeline(image=image, question=question, top_k=2)
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                {"score": 0.9998, "answer": "us-001", "start": 15, "end": 15},
                {"score": 0.0, "answer": "INVOICE # us-001", "start": 13, "end": 15},
            ],
        )
        outputs = dqa_pipeline({"image": image, "question": question}, top_k=2)
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                {"score": 0.9998, "answer": "us-001", "start": 15, "end": 15},
                {"score": 0.0, "answer": "INVOICE # us-001", "start": 13, "end": 15},
            ],
        )
        outputs = dqa_pipeline(
            [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2
        )
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                [
                    {"score": 0.9998, "answer": "us-001", "start": 15, "end": 15},
                    {"score": 0.0, "answer": "INVOICE # us-001", "start": 13, "end": 15},
                ]
            ]
            * 2,
        )
        word_boxes = list(zip(*apply_tesseract(load_image(image), None, "")))
        # This model should also work if `image` is set to None
        outputs = dqa_pipeline({"image": None, "word_boxes": word_boxes, "question": question}, top_k=2)
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                {"score": 0.9998, "answer": "us-001", "start": 15, "end": 15},
                {"score": 0.0, "answer": "INVOICE # us-001", "start": 13, "end": 15},
            ],
        )

    @slow
    @require_torch
    def test_large_model_pt_donut(self):
        """Donut (generative seq2seq): returns a bare answer dict, no span/score fields."""
        dqa_pipeline = pipeline(
            "document-question-answering",
            model="naver-clova-ix/donut-base-finetuned-docvqa",
            tokenizer=AutoTokenizer.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa"),
            feature_extractor="naver-clova-ix/donut-base-finetuned-docvqa",
        )
        image = INVOICE_URL
        question = "What is the invoice number?"
        outputs = dqa_pipeline(image=image, question=question, top_k=2)
        self.assertEqual(nested_simplify(outputs, decimals=4), {"answer": "us-001"})

    @require_tf
    @unittest.skip("Document question answering not implemented in TF")
    def test_small_model_tf(self):
        pass
...@@ -89,6 +89,7 @@ if is_torch_available(): ...@@ -89,6 +89,7 @@ if is_torch_available():
MODEL_FOR_AUDIO_XVECTOR_MAPPING, MODEL_FOR_AUDIO_XVECTOR_MAPPING,
MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING,
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
MODEL_FOR_MASKED_LM_MAPPING, MODEL_FOR_MASKED_LM_MAPPING,
...@@ -172,7 +173,10 @@ class ModelTesterMixin: ...@@ -172,7 +173,10 @@ class ModelTesterMixin:
if return_labels: if return_labels:
if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device) inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device)
elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): elif model_class in [
*get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING),
*get_values(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING),
]:
inputs_dict["start_positions"] = torch.zeros( inputs_dict["start_positions"] = torch.zeros(
self.model_tester.batch_size, dtype=torch.long, device=torch_device self.model_tester.batch_size, dtype=torch.long, device=torch_device
) )
...@@ -542,7 +546,10 @@ class ModelTesterMixin: ...@@ -542,7 +546,10 @@ class ModelTesterMixin:
if "labels" in inputs_dict: if "labels" in inputs_dict:
correct_outlen += 1 # loss is added to beginning correct_outlen += 1 # loss is added to beginning
# Question Answering model returns start_logits and end_logits # Question Answering model returns start_logits and end_logits
if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): if model_class in [
*get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING),
*get_values(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING),
]:
correct_outlen += 1 # start_logits and end_logits instead of only 1 output correct_outlen += 1 # start_logits and end_logits instead of only 1 output
if "past_key_values" in outputs: if "past_key_values" in outputs:
correct_outlen += 1 # past_key_values have been returned correct_outlen += 1 # past_key_values have been returned
......
...@@ -61,6 +61,7 @@ if is_tf_available(): ...@@ -61,6 +61,7 @@ if is_tf_available():
from transformers import ( from transformers import (
TF_MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
TF_MODEL_FOR_MASKED_LM_MAPPING, TF_MODEL_FOR_MASKED_LM_MAPPING,
...@@ -149,7 +150,10 @@ class TFModelTesterMixin: ...@@ -149,7 +150,10 @@ class TFModelTesterMixin:
if return_labels: if return_labels:
if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32) inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING): elif model_class in [
*get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING),
*get_values(TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING),
]:
inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
elif model_class in [ elif model_class in [
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment