Unverified Commit 2ef77421 authored by Ankur Goyal's avatar Ankur Goyal Committed by GitHub
Browse files

Add DocumentQuestionAnswering pipeline (#18414)



* [WIP] Skeleton of VisualQuestionAnsweringPipeline extended to support LayoutLM-like models

* Fixup

* Use the full encoding

* Basic refactoring to DocumentQuestionAnsweringPipeline

* Cleanup

* Improve args, docs, and implement preprocessing

* Integrate OCR

* Refactor question_answering pipeline

* Use refactored QA code in the document qa pipeline

* Fix tests

* Some small cleanups

* Use a string type annotation for Image.Image

* Update encoding with image features

* Wire through the basic docs

* Handle invalid response

* Handle empty word_boxes properly

* Docstring fix

* Integrate Donut model

* Fixup

* Incorporate comments

* Address comments

* Initial incorporation of tests

* Address Comments

* Change assert to ValueError

* Comments

* Wrap `score` in float to make it JSON serializable

* Incorporate AutoModelForDocumentQuestionAnswering changes

* Fixup

* Rename postprocess function

* Fix auto import

* Applying comments

* Improve docs

* Remove extra assets and add copyright

* Address comments
Co-authored-by: default avatarAnkur Goyal <ankur@impira.com>
parent 3059d80d
...@@ -25,6 +25,7 @@ There are two categories of pipeline abstractions to be aware about: ...@@ -25,6 +25,7 @@ There are two categories of pipeline abstractions to be aware about:
- [`AudioClassificationPipeline`] - [`AudioClassificationPipeline`]
- [`AutomaticSpeechRecognitionPipeline`] - [`AutomaticSpeechRecognitionPipeline`]
- [`ConversationalPipeline`] - [`ConversationalPipeline`]
- [`DocumentQuestionAnsweringPipeline`]
- [`FeatureExtractionPipeline`] - [`FeatureExtractionPipeline`]
- [`FillMaskPipeline`] - [`FillMaskPipeline`]
- [`ImageClassificationPipeline`] - [`ImageClassificationPipeline`]
...@@ -342,6 +343,12 @@ That should enable you to do all the custom code you want. ...@@ -342,6 +343,12 @@ That should enable you to do all the custom code you want.
- __call__ - __call__
- all - all
### DocumentQuestionAnsweringPipeline
[[autodoc]] DocumentQuestionAnsweringPipeline
- __call__
- all
### FeatureExtractionPipeline ### FeatureExtractionPipeline
[[autodoc]] FeatureExtractionPipeline [[autodoc]] FeatureExtractionPipeline
......
...@@ -114,6 +114,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its ...@@ -114,6 +114,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its
[[autodoc]] AutoModelForTableQuestionAnswering [[autodoc]] AutoModelForTableQuestionAnswering
## AutoModelForDocumentQuestionAnswering
[[autodoc]] AutoModelForDocumentQuestionAnswering
## AutoModelForImageClassification ## AutoModelForImageClassification
[[autodoc]] AutoModelForImageClassification [[autodoc]] AutoModelForImageClassification
...@@ -214,6 +218,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its ...@@ -214,6 +218,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its
[[autodoc]] TFAutoModelForTableQuestionAnswering [[autodoc]] TFAutoModelForTableQuestionAnswering
## TFAutoModelForDocumentQuestionAnswering
[[autodoc]] TFAutoModelForDocumentQuestionAnswering
## TFAutoModelForTokenClassification ## TFAutoModelForTokenClassification
[[autodoc]] TFAutoModelForTokenClassification [[autodoc]] TFAutoModelForTokenClassification
......
...@@ -383,6 +383,7 @@ _import_structure = { ...@@ -383,6 +383,7 @@ _import_structure = {
"Conversation", "Conversation",
"ConversationalPipeline", "ConversationalPipeline",
"CsvPipelineDataFormat", "CsvPipelineDataFormat",
"DocumentQuestionAnsweringPipeline",
"FeatureExtractionPipeline", "FeatureExtractionPipeline",
"FillMaskPipeline", "FillMaskPipeline",
"ImageClassificationPipeline", "ImageClassificationPipeline",
...@@ -789,6 +790,7 @@ else: ...@@ -789,6 +790,7 @@ else:
"MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING", "MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING",
"MODEL_FOR_CAUSAL_LM_MAPPING", "MODEL_FOR_CAUSAL_LM_MAPPING",
"MODEL_FOR_CTC_MAPPING", "MODEL_FOR_CTC_MAPPING",
"MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING", "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
"MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING", "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
...@@ -816,6 +818,7 @@ else: ...@@ -816,6 +818,7 @@ else:
"AutoModelForAudioXVector", "AutoModelForAudioXVector",
"AutoModelForCausalLM", "AutoModelForCausalLM",
"AutoModelForCTC", "AutoModelForCTC",
"AutoModelForDocumentQuestionAnswering",
"AutoModelForImageClassification", "AutoModelForImageClassification",
"AutoModelForImageSegmentation", "AutoModelForImageSegmentation",
"AutoModelForInstanceSegmentation", "AutoModelForInstanceSegmentation",
...@@ -2107,6 +2110,7 @@ else: ...@@ -2107,6 +2110,7 @@ else:
"TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING", "TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
"TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING", "TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
"TF_MODEL_FOR_PRETRAINING_MAPPING", "TF_MODEL_FOR_PRETRAINING_MAPPING",
"TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING", "TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING", "TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
"TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", "TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
...@@ -2124,6 +2128,7 @@ else: ...@@ -2124,6 +2128,7 @@ else:
"TFAutoModelForMultipleChoice", "TFAutoModelForMultipleChoice",
"TFAutoModelForNextSentencePrediction", "TFAutoModelForNextSentencePrediction",
"TFAutoModelForPreTraining", "TFAutoModelForPreTraining",
"TFAutoModelForDocumentQuestionAnswering",
"TFAutoModelForQuestionAnswering", "TFAutoModelForQuestionAnswering",
"TFAutoModelForSemanticSegmentation", "TFAutoModelForSemanticSegmentation",
"TFAutoModelForSeq2SeqLM", "TFAutoModelForSeq2SeqLM",
...@@ -3200,6 +3205,7 @@ if TYPE_CHECKING: ...@@ -3200,6 +3205,7 @@ if TYPE_CHECKING:
Conversation, Conversation,
ConversationalPipeline, ConversationalPipeline,
CsvPipelineDataFormat, CsvPipelineDataFormat,
DocumentQuestionAnsweringPipeline,
FeatureExtractionPipeline, FeatureExtractionPipeline,
FillMaskPipeline, FillMaskPipeline,
ImageClassificationPipeline, ImageClassificationPipeline,
...@@ -3549,6 +3555,7 @@ if TYPE_CHECKING: ...@@ -3549,6 +3555,7 @@ if TYPE_CHECKING:
MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING,
MODEL_FOR_CTC_MAPPING, MODEL_FOR_CTC_MAPPING,
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
...@@ -3576,6 +3583,7 @@ if TYPE_CHECKING: ...@@ -3576,6 +3583,7 @@ if TYPE_CHECKING:
AutoModelForAudioXVector, AutoModelForAudioXVector,
AutoModelForCausalLM, AutoModelForCausalLM,
AutoModelForCTC, AutoModelForCTC,
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification, AutoModelForImageClassification,
AutoModelForImageSegmentation, AutoModelForImageSegmentation,
AutoModelForInstanceSegmentation, AutoModelForInstanceSegmentation,
...@@ -4637,6 +4645,7 @@ if TYPE_CHECKING: ...@@ -4637,6 +4645,7 @@ if TYPE_CHECKING:
) )
from .models.auto import ( from .models.auto import (
TF_MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
TF_MODEL_FOR_MASKED_LM_MAPPING, TF_MODEL_FOR_MASKED_LM_MAPPING,
...@@ -4655,6 +4664,7 @@ if TYPE_CHECKING: ...@@ -4655,6 +4664,7 @@ if TYPE_CHECKING:
TF_MODEL_WITH_LM_HEAD_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING,
TFAutoModel, TFAutoModel,
TFAutoModelForCausalLM, TFAutoModelForCausalLM,
TFAutoModelForDocumentQuestionAnswering,
TFAutoModelForImageClassification, TFAutoModelForImageClassification,
TFAutoModelForMaskedLM, TFAutoModelForMaskedLM,
TFAutoModelForMultipleChoice, TFAutoModelForMultipleChoice,
......
...@@ -47,6 +47,7 @@ else: ...@@ -47,6 +47,7 @@ else:
"MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING", "MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING",
"MODEL_FOR_CAUSAL_LM_MAPPING", "MODEL_FOR_CAUSAL_LM_MAPPING",
"MODEL_FOR_CTC_MAPPING", "MODEL_FOR_CTC_MAPPING",
"MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING", "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
"MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING", "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
...@@ -93,6 +94,7 @@ else: ...@@ -93,6 +94,7 @@ else:
"AutoModelForVideoClassification", "AutoModelForVideoClassification",
"AutoModelForVision2Seq", "AutoModelForVision2Seq",
"AutoModelForVisualQuestionAnswering", "AutoModelForVisualQuestionAnswering",
"AutoModelForDocumentQuestionAnswering",
"AutoModelWithLMHead", "AutoModelWithLMHead",
] ]
...@@ -111,6 +113,7 @@ else: ...@@ -111,6 +113,7 @@ else:
"TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING", "TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
"TF_MODEL_FOR_PRETRAINING_MAPPING", "TF_MODEL_FOR_PRETRAINING_MAPPING",
"TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING", "TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING", "TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
"TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", "TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
"TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING", "TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
...@@ -127,6 +130,7 @@ else: ...@@ -127,6 +130,7 @@ else:
"TFAutoModelForMultipleChoice", "TFAutoModelForMultipleChoice",
"TFAutoModelForNextSentencePrediction", "TFAutoModelForNextSentencePrediction",
"TFAutoModelForPreTraining", "TFAutoModelForPreTraining",
"TFAutoModelForDocumentQuestionAnswering",
"TFAutoModelForQuestionAnswering", "TFAutoModelForQuestionAnswering",
"TFAutoModelForSemanticSegmentation", "TFAutoModelForSemanticSegmentation",
"TFAutoModelForSeq2SeqLM", "TFAutoModelForSeq2SeqLM",
...@@ -191,6 +195,7 @@ if TYPE_CHECKING: ...@@ -191,6 +195,7 @@ if TYPE_CHECKING:
MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING,
MODEL_FOR_CTC_MAPPING, MODEL_FOR_CTC_MAPPING,
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
...@@ -218,6 +223,7 @@ if TYPE_CHECKING: ...@@ -218,6 +223,7 @@ if TYPE_CHECKING:
AutoModelForAudioXVector, AutoModelForAudioXVector,
AutoModelForCausalLM, AutoModelForCausalLM,
AutoModelForCTC, AutoModelForCTC,
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification, AutoModelForImageClassification,
AutoModelForImageSegmentation, AutoModelForImageSegmentation,
AutoModelForInstanceSegmentation, AutoModelForInstanceSegmentation,
...@@ -248,6 +254,7 @@ if TYPE_CHECKING: ...@@ -248,6 +254,7 @@ if TYPE_CHECKING:
else: else:
from .modeling_tf_auto import ( from .modeling_tf_auto import (
TF_MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
TF_MODEL_FOR_MASKED_LM_MAPPING, TF_MODEL_FOR_MASKED_LM_MAPPING,
...@@ -266,6 +273,7 @@ if TYPE_CHECKING: ...@@ -266,6 +273,7 @@ if TYPE_CHECKING:
TF_MODEL_WITH_LM_HEAD_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING,
TFAutoModel, TFAutoModel,
TFAutoModelForCausalLM, TFAutoModelForCausalLM,
TFAutoModelForDocumentQuestionAnswering,
TFAutoModelForImageClassification, TFAutoModelForImageClassification,
TFAutoModelForMaskedLM, TFAutoModelForMaskedLM,
TFAutoModelForMultipleChoice, TFAutoModelForMultipleChoice,
......
...@@ -603,6 +603,14 @@ MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( ...@@ -603,6 +603,14 @@ MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
] ]
) )
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
    [
        # Model for Document Question Answering mapping
        # LayoutLM-family models reuse their extractive QA heads for document QA.
        ("layoutlm", "LayoutLMForQuestionAnswering"),
        ("layoutlmv2", "LayoutLMv2ForQuestionAnswering"),
        ("layoutlmv3", "LayoutLMv3ForQuestionAnswering"),
    ]
)
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[ [
# Model for Token Classification mapping # Model for Token Classification mapping
...@@ -773,6 +781,9 @@ MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FO ...@@ -773,6 +781,9 @@ MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FO
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
) )
# Lazily resolves config classes to the document-QA head classes named above,
# deferring model imports until a model is actually requested.
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
    CONFIG_MAPPING_NAMES, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES
)
MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES) MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES)
MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping( MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES
...@@ -891,6 +902,17 @@ AutoModelForVisualQuestionAnswering = auto_class_update( ...@@ -891,6 +902,17 @@ AutoModelForVisualQuestionAnswering = auto_class_update(
) )
class AutoModelForDocumentQuestionAnswering(_BaseAutoModelClass):
    # Auto class: `from_pretrained`/`from_config` dispatch through the
    # document-question-answering head mapping.
    _model_mapping = MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING


AutoModelForDocumentQuestionAnswering = auto_class_update(
    AutoModelForDocumentQuestionAnswering,
    head_doc="document question answering",
    # NOTE(review): the embedded quote looks like a bug but appears deliberate — it
    # splices a `revision="3dc6de3"` kwarg into the generated docstring example.
    # Confirm against the `auto_class_update` docstring template before changing.
    checkpoint_for_example='impira/layoutlm-document-qa", revision="3dc6de3',
)
class AutoModelForTokenClassification(_BaseAutoModelClass): class AutoModelForTokenClassification(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING _model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
......
...@@ -315,6 +315,13 @@ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( ...@@ -315,6 +315,13 @@ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
] ]
) )
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
    [
        # Model for Document Question Answering mapping (TensorFlow)
        # Only LayoutLM has a TF question-answering head in this mapping.
        ("layoutlm", "TFLayoutLMForQuestionAnswering"),
    ]
)
TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
[ [
# Model for Table Question Answering mapping # Model for Table Question Answering mapping
...@@ -406,6 +413,9 @@ TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping( ...@@ -406,6 +413,9 @@ TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping(
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES CONFIG_MAPPING_NAMES, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
) )
# Lazy config-to-model resolution for the TF document-QA heads named above.
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
    CONFIG_MAPPING_NAMES, TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES
)
TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES
) )
...@@ -515,6 +525,17 @@ class TFAutoModelForQuestionAnswering(_BaseAutoModelClass): ...@@ -515,6 +525,17 @@ class TFAutoModelForQuestionAnswering(_BaseAutoModelClass):
TFAutoModelForQuestionAnswering = auto_class_update(TFAutoModelForQuestionAnswering, head_doc="question answering") TFAutoModelForQuestionAnswering = auto_class_update(TFAutoModelForQuestionAnswering, head_doc="question answering")
class TFAutoModelForDocumentQuestionAnswering(_BaseAutoModelClass):
    # TensorFlow auto class for document question answering heads.
    _model_mapping = TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING


TFAutoModelForDocumentQuestionAnswering = auto_class_update(
    TFAutoModelForDocumentQuestionAnswering,
    head_doc="document question answering",
    # NOTE(review): the embedded quote deliberately injects a `revision="3dc6de3"`
    # kwarg into the generated doc example — confirm against `auto_class_update`.
    checkpoint_for_example='impira/layoutlm-document-qa", revision="3dc6de3',
)
class TFAutoModelForTableQuestionAnswering(_BaseAutoModelClass): class TFAutoModelForTableQuestionAnswering(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING _model_mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
......
...@@ -51,6 +51,7 @@ from .base import ( ...@@ -51,6 +51,7 @@ from .base import (
infer_framework_load_model, infer_framework_load_model,
) )
from .conversational import Conversation, ConversationalPipeline from .conversational import Conversation, ConversationalPipeline
from .document_question_answering import DocumentQuestionAnsweringPipeline
from .feature_extraction import FeatureExtractionPipeline from .feature_extraction import FeatureExtractionPipeline
from .fill_mask import FillMaskPipeline from .fill_mask import FillMaskPipeline
from .image_classification import ImageClassificationPipeline from .image_classification import ImageClassificationPipeline
...@@ -109,6 +110,7 @@ if is_torch_available(): ...@@ -109,6 +110,7 @@ if is_torch_available():
AutoModelForAudioClassification, AutoModelForAudioClassification,
AutoModelForCausalLM, AutoModelForCausalLM,
AutoModelForCTC, AutoModelForCTC,
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification, AutoModelForImageClassification,
AutoModelForImageSegmentation, AutoModelForImageSegmentation,
AutoModelForMaskedLM, AutoModelForMaskedLM,
...@@ -215,6 +217,15 @@ SUPPORTED_TASKS = { ...@@ -215,6 +217,15 @@ SUPPORTED_TASKS = {
}, },
"type": "multimodal", "type": "multimodal",
}, },
"document-question-answering": {
"impl": DocumentQuestionAnsweringPipeline,
"pt": (AutoModelForDocumentQuestionAnswering,) if is_torch_available() else (),
"tf": (),
"default": {
"model": {"pt": ("impira/layoutlm-document-qa", "3a93017")},
},
"type": "multimodal",
},
"fill-mask": { "fill-mask": {
"impl": FillMaskPipeline, "impl": FillMaskPipeline,
"tf": (TFAutoModelForMaskedLM,) if is_tf_available() else (), "tf": (TFAutoModelForMaskedLM,) if is_tf_available() else (),
...@@ -443,7 +454,7 @@ def pipeline( ...@@ -443,7 +454,7 @@ def pipeline(
trust_remote_code: Optional[bool] = None, trust_remote_code: Optional[bool] = None,
model_kwargs: Dict[str, Any] = None, model_kwargs: Dict[str, Any] = None,
pipeline_class: Optional[Any] = None, pipeline_class: Optional[Any] = None,
**kwargs **kwargs,
) -> Pipeline: ) -> Pipeline:
""" """
Utility factory method to build a [`Pipeline`]. Utility factory method to build a [`Pipeline`].
......
...@@ -178,7 +178,7 @@ def infer_framework_load_model( ...@@ -178,7 +178,7 @@ def infer_framework_load_model(
model_classes: Optional[Dict[str, Tuple[type]]] = None, model_classes: Optional[Dict[str, Tuple[type]]] = None,
task: Optional[str] = None, task: Optional[str] = None,
framework: Optional[str] = None, framework: Optional[str] = None,
**model_kwargs **model_kwargs,
): ):
""" """
Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model). Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).
...@@ -274,7 +274,7 @@ def infer_framework_from_model( ...@@ -274,7 +274,7 @@ def infer_framework_from_model(
model_classes: Optional[Dict[str, Tuple[type]]] = None, model_classes: Optional[Dict[str, Tuple[type]]] = None,
task: Optional[str] = None, task: Optional[str] = None,
framework: Optional[str] = None, framework: Optional[str] = None,
**model_kwargs **model_kwargs,
): ):
""" """
Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model). Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).
......
This diff is collapsed.
...@@ -42,6 +42,110 @@ if is_torch_available(): ...@@ -42,6 +42,110 @@ if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING from ..models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING
def decode_spans(
    start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int, undesired_tokens: np.ndarray
) -> Tuple:
    """
    Turn per-token start/end probabilities from a `ModelForQuestionAnswering` into the k best answer spans.

    Spans that are impossible (end before start) or too long (length > `max_answer_len`), and spans whose
    endpoints fall on tokens masked out by `undesired_tokens`, are discarded. The `topk` argument controls
    how many candidate spans are returned.

    Args:
        start (`np.ndarray`): Individual start probabilities for each token.
        end (`np.ndarray`): Individual end probabilities for each token.
        topk (`int`): Indicates how many possible answer span(s) to extract from the model output.
        max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
        undesired_tokens (`np.ndarray`): Mask determining tokens that can be part of the answer
    """
    # Promote 1-D inputs to a batch of size one so the pairwise product below is well-defined.
    if start.ndim == 1:
        start = start[None]
    if end.ndim == 1:
        end = end[None]

    # Score every (start, end) pair as p(start) * p(end) via an outer product.
    pair_scores = np.expand_dims(start, -1) @ np.expand_dims(end, 1)

    # Zero out pairs with end < start (triu) and spans longer than max_answer_len (tril).
    pair_scores = np.tril(np.triu(pair_scores), max_answer_len - 1)

    # Pick the topk highest-scoring flat indices (inspired by Chen & al., facebookresearch/DrQA).
    flat = pair_scores.flatten()
    if topk == 1:
        best = [np.argmax(flat)]
    elif len(flat) < topk:
        best = np.argsort(-flat)
    else:
        partial = np.argpartition(-flat, topk)[0:topk]
        best = partial[np.argsort(-flat[partial])]

    starts, ends = np.unravel_index(best, pair_scores.shape)[1:]

    # Keep only spans whose endpoints are on tokens allowed by the mask.
    allowed = undesired_tokens.nonzero()
    keep = np.isin(starts, allowed) & np.isin(ends, allowed)
    starts = starts[keep]
    ends = ends[keep]
    scores = pair_scores[0, starts, ends]
    return starts, ends, scores
def select_starts_ends(
    start,
    end,
    p_mask,
    attention_mask,
    min_null_score=1000000,
    top_k=1,
    handle_impossible_answer=False,
    max_answer_len=15,
):
    """
    Takes the raw output of any `ModelForQuestionAnswering`, normalizes the start/end logits, and delegates
    to `decode_spans()` to produce probabilities for each candidate answer span.

    Args:
        start (`np.ndarray`): Individual start logits for each token.
        end (`np.ndarray`): Individual end logits for each token.
        p_mask (`np.ndarray`): A mask with 1 for values that cannot be in the answer
        attention_mask (`np.ndarray`): The attention mask generated by the tokenizer
        min_null_score(`float`): The minimum null (empty) answer score seen so far.
        top_k (`int`): Indicates how many possible answer span(s) to extract from the model output.
        handle_impossible_answer(`bool`): Whether to allow null (empty) answers
        max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
    """
    # Candidate tokens are the inverse of p_mask, further restricted to attended positions.
    candidate_mask = np.abs(np.array(p_mask) - 1)
    if attention_mask is not None:
        candidate_mask = candidate_mask & attention_mask

    blocked = candidate_mask == 0.0

    # Drive blocked positions to a large negative logit so they vanish after the softmax.
    start = np.where(blocked, -10000.0, start)
    end = np.where(blocked, -10000.0, end)

    # Softmax over the logits (note: the division normalizes over the whole array,
    # matching the historical behaviour of the QA pipeline).
    start = np.exp(start - start.max(axis=-1, keepdims=True))
    start = start / start.sum()
    end = np.exp(end - end.max(axis=-1, keepdims=True))
    end = end / end.sum()

    if handle_impossible_answer:
        # Track the best "no answer" score, taken from the CLS-to-CLS span.
        min_null_score = min(min_null_score, (start[0, 0] * end[0, 0]).item())

    # Mask CLS so it cannot be selected as part of a real answer span.
    # NOTE(review): indentation was lost in this paste — placement outside the
    # `handle_impossible_answer` branch matches the pre-refactor pipeline code; confirm.
    start[0, 0] = end[0, 0] = 0.0

    starts, ends, scores = decode_spans(start, end, top_k, max_answer_len, candidate_mask)
    return starts, ends, scores, min_null_score
class QuestionAnsweringArgumentHandler(ArgumentHandler): class QuestionAnsweringArgumentHandler(ArgumentHandler):
""" """
QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to
...@@ -141,7 +245,7 @@ class QuestionAnsweringPipeline(ChunkPipeline): ...@@ -141,7 +245,7 @@ class QuestionAnsweringPipeline(ChunkPipeline):
framework: Optional[str] = None, framework: Optional[str] = None,
device: int = -1, device: int = -1,
task: str = "", task: str = "",
**kwargs **kwargs,
): ):
super().__init__( super().__init__(
model=model, model=model,
...@@ -410,34 +514,15 @@ class QuestionAnsweringPipeline(ChunkPipeline): ...@@ -410,34 +514,15 @@ class QuestionAnsweringPipeline(ChunkPipeline):
start_ = output["start"] start_ = output["start"]
end_ = output["end"] end_ = output["end"]
example = output["example"] example = output["example"]
p_mask = output["p_mask"]
attention_mask = (
output["attention_mask"].numpy() if output.get("attention_mask", None) is not None else None
)
# Ensure padded tokens & question tokens cannot belong to the set of candidate answers. starts, ends, scores, min_null_score = select_starts_ends(
undesired_tokens = np.abs(np.array(output["p_mask"]) - 1) start_, end_, p_mask, attention_mask, min_null_score, top_k, handle_impossible_answer, max_answer_len
)
if output.get("attention_mask", None) is not None:
undesired_tokens = undesired_tokens & output["attention_mask"].numpy()
# Generate mask
undesired_tokens_mask = undesired_tokens == 0.0
# Make sure non-context indexes in the tensor cannot contribute to the softmax
start_ = np.where(undesired_tokens_mask, -10000.0, start_)
end_ = np.where(undesired_tokens_mask, -10000.0, end_)
# Normalize logits and spans to retrieve the answer
start_ = np.exp(start_ - start_.max(axis=-1, keepdims=True))
start_ = start_ / start_.sum()
end_ = np.exp(end_ - end_.max(axis=-1, keepdims=True))
end_ = end_ / end_.sum()
if handle_impossible_answer:
min_null_score = min(min_null_score, (start_[0, 0] * end_[0, 0]).item())
# Mask CLS
start_[0, 0] = end_[0, 0] = 0.0
starts, ends, scores = self.decode(start_, end_, top_k, max_answer_len, undesired_tokens)
if not self.tokenizer.is_fast: if not self.tokenizer.is_fast:
char_to_word = np.array(example.char_to_word_offset) char_to_word = np.array(example.char_to_word_offset)
...@@ -518,55 +603,6 @@ class QuestionAnsweringPipeline(ChunkPipeline): ...@@ -518,55 +603,6 @@ class QuestionAnsweringPipeline(ChunkPipeline):
end_index = enc.offsets[e][1] end_index = enc.offsets[e][1]
return start_index, end_index return start_index, end_index
def decode(
    self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int, undesired_tokens: np.ndarray
) -> Tuple:
    """
    Take the output of any `ModelForQuestionAnswering` and will generate probabilities for each span to be the
    actual answer.

    In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or
    answer end position being before the starting position. The method supports output the k-best answer through
    the topk argument.

    NOTE: this legacy method duplicates the module-level `decode_spans` function; the pipeline's
    postprocessing path now calls `select_starts_ends`/`decode_spans` instead.

    Args:
        start (`np.ndarray`): Individual start probabilities for each token.
        end (`np.ndarray`): Individual end probabilities for each token.
        topk (`int`): Indicates how many possible answer span(s) to extract from the model output.
        max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
        undesired_tokens (`np.ndarray`): Mask determining tokens that can be part of the answer
    """
    # Ensure we have batch axis
    if start.ndim == 1:
        start = start[None]
    if end.ndim == 1:
        end = end[None]

    # Compute the score of each tuple(start, end) to be the real answer
    outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

    # Remove candidate with end < start and end - start > max_answer_len
    candidates = np.tril(np.triu(outer), max_answer_len - 1)

    # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
    scores_flat = candidates.flatten()
    if topk == 1:
        idx_sort = [np.argmax(scores_flat)]
    elif len(scores_flat) < topk:
        idx_sort = np.argsort(-scores_flat)
    else:
        # Partial sort: partition to the topk scores, then order only those.
        idx = np.argpartition(-scores_flat, topk)[0:topk]
        idx_sort = idx[np.argsort(-scores_flat[idx])]

    # Convert flat indices back to (start, end) pairs, dropping the batch axis.
    starts, ends = np.unravel_index(idx_sort, candidates.shape)[1:]
    # Keep only spans whose endpoints land on tokens allowed by the mask.
    desired_spans = np.isin(starts, undesired_tokens.nonzero()) & np.isin(ends, undesired_tokens.nonzero())
    starts = starts[desired_spans]
    ends = ends[desired_spans]
    scores = candidates[0, starts, ends]

    return starts, ends, scores
def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]: def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]:
""" """
When decoding from token probabilities, this method maps token indexes to actual word in the initial context. When decoding from token probabilities, this method maps token indexes to actual word in the initial context.
......
...@@ -358,6 +358,9 @@ MODEL_FOR_CAUSAL_LM_MAPPING = None ...@@ -358,6 +358,9 @@ MODEL_FOR_CAUSAL_LM_MAPPING = None
MODEL_FOR_CTC_MAPPING = None MODEL_FOR_CTC_MAPPING = None
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = None
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None
...@@ -463,6 +466,13 @@ class AutoModelForCTC(metaclass=DummyObject): ...@@ -463,6 +466,13 @@ class AutoModelForCTC(metaclass=DummyObject):
requires_backends(self, ["torch"]) requires_backends(self, ["torch"])
class AutoModelForDocumentQuestionAnswering(metaclass=DummyObject):
    # Dummy placeholder: raises an informative ImportError when torch is missing.
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
class AutoModelForImageClassification(metaclass=DummyObject): class AutoModelForImageClassification(metaclass=DummyObject):
_backends = ["torch"] _backends = ["torch"]
......
...@@ -265,6 +265,9 @@ class TFAlbertPreTrainedModel(metaclass=DummyObject): ...@@ -265,6 +265,9 @@ class TFAlbertPreTrainedModel(metaclass=DummyObject):
TF_MODEL_FOR_CAUSAL_LM_MAPPING = None TF_MODEL_FOR_CAUSAL_LM_MAPPING = None
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = None
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None
...@@ -327,6 +330,13 @@ class TFAutoModelForCausalLM(metaclass=DummyObject): ...@@ -327,6 +330,13 @@ class TFAutoModelForCausalLM(metaclass=DummyObject):
requires_backends(self, ["tf"]) requires_backends(self, ["tf"])
class TFAutoModelForDocumentQuestionAnswering(metaclass=DummyObject):
    # Dummy placeholder: raises an informative ImportError when TensorFlow is missing.
    _backends = ["tf"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tf"])
class TFAutoModelForImageClassification(metaclass=DummyObject): class TFAutoModelForImageClassification(metaclass=DummyObject):
_backends = ["tf"] _backends = ["tf"]
......
...@@ -36,6 +36,7 @@ from ..models.auto.modeling_auto import ( ...@@ -36,6 +36,7 @@ from ..models.auto.modeling_auto import (
MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
MODEL_FOR_CTC_MAPPING_NAMES, MODEL_FOR_CTC_MAPPING_NAMES,
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES,
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES,
MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES,
MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES,
...@@ -71,6 +72,7 @@ def _generate_supported_model_class_names( ...@@ -71,6 +72,7 @@ def _generate_supported_model_class_names(
"seq2seq-lm": MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, "seq2seq-lm": MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
"speech-seq2seq": MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, "speech-seq2seq": MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES,
"multiple-choice": MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES, "multiple-choice": MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES,
"document-question-answering": MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES,
"question-answering": MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, "question-answering": MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES,
"sequence-classification": MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, "sequence-classification": MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES,
"token-classification": MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, "token-classification": MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
...@@ -147,7 +149,6 @@ _SPECIAL_SUPPORTED_MODELS = [ ...@@ -147,7 +149,6 @@ _SPECIAL_SUPPORTED_MODELS = [
"GPT2DoubleHeadsModel", "GPT2DoubleHeadsModel",
"Speech2Text2Decoder", "Speech2Text2Decoder",
"TrOCRDecoder", "TrOCRDecoder",
"LayoutLMForQuestionAnswering",
# TODO: add support for them as it should be quite easy to do so (small blocking issues). # TODO: add support for them as it should be quite easy to do so (small blocking issues).
# XLNetForQuestionAnswering, # XLNetForQuestionAnswering,
] ]
...@@ -691,7 +692,7 @@ class HFTracer(Tracer): ...@@ -691,7 +692,7 @@ class HFTracer(Tracer):
inputs_dict["labels"] = torch.zeros(batch_size, dtype=torch.long, device=device) inputs_dict["labels"] = torch.zeros(batch_size, dtype=torch.long, device=device)
elif model_class_name in [ elif model_class_name in [
*get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES), *get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES),
"LayoutLMForQuestionAnswering", *get_values(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES),
"XLNetForQuestionAnswering", "XLNetForQuestionAnswering",
]: ]:
inputs_dict["start_positions"] = torch.zeros(batch_size, dtype=torch.long, device=device) inputs_dict["start_positions"] = torch.zeros(batch_size, dtype=torch.long, device=device)
......
...@@ -12,12 +12,9 @@ ...@@ -12,12 +12,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import copy
import unittest import unittest
from transformers import LayoutLMConfig, is_torch_available from transformers import LayoutLMConfig, is_torch_available
from transformers.models.auto import get_values
from transformers.testing_utils import require_torch, slow, torch_device from transformers.testing_utils import require_torch, slow, torch_device
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
...@@ -28,9 +25,6 @@ if is_torch_available(): ...@@ -28,9 +25,6 @@ if is_torch_available():
import torch import torch
from transformers import ( from transformers import (
MODEL_FOR_MASKED_LM_MAPPING,
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
LayoutLMForMaskedLM, LayoutLMForMaskedLM,
LayoutLMForQuestionAnswering, LayoutLMForQuestionAnswering,
LayoutLMForSequenceClassification, LayoutLMForSequenceClassification,
...@@ -273,30 +267,6 @@ class LayoutLMModelTest(ModelTesterMixin, unittest.TestCase): ...@@ -273,30 +267,6 @@ class LayoutLMModelTest(ModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs() config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_question_answering(*config_and_inputs) self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = copy.deepcopy(inputs_dict)
if return_labels:
if model_class in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
inputs_dict["labels"] = torch.zeros(
self.model_tester.batch_size, dtype=torch.long, device=torch_device
)
elif model_class in [
*get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
*get_values(MODEL_FOR_MASKED_LM_MAPPING),
]:
inputs_dict["labels"] = torch.zeros(
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
)
elif model_class.__name__ == "LayoutLMForQuestionAnswering":
inputs_dict["start_positions"] = torch.zeros(
self.model_tester.batch_size, dtype=torch.long, device=torch_device
)
inputs_dict["end_positions"] = torch.zeros(
self.model_tester.batch_size, dtype=torch.long, device=torch_device
)
return inputs_dict
def prepare_layoutlm_batch_inputs(): def prepare_layoutlm_batch_inputs():
# Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on: # Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on:
......
...@@ -13,13 +13,11 @@ ...@@ -13,13 +13,11 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import copy
import unittest import unittest
import numpy as np import numpy as np
from transformers import LayoutLMConfig, is_tf_available from transformers import LayoutLMConfig, is_tf_available
from transformers.models.auto import get_values
from transformers.testing_utils import require_tf, slow from transformers.testing_utils import require_tf, slow
from ...test_configuration_common import ConfigTester from ...test_configuration_common import ConfigTester
...@@ -29,11 +27,6 @@ from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_at ...@@ -29,11 +27,6 @@ from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_at
if is_tf_available(): if is_tf_available():
import tensorflow as tf import tensorflow as tf
from transformers import (
TF_MODEL_FOR_MASKED_LM_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
)
from transformers.models.layoutlm.modeling_tf_layoutlm import ( from transformers.models.layoutlm.modeling_tf_layoutlm import (
TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST, TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
TFLayoutLMForMaskedLM, TFLayoutLMForMaskedLM,
...@@ -263,24 +256,6 @@ class TFLayoutLMModelTest(TFModelTesterMixin, unittest.TestCase): ...@@ -263,24 +256,6 @@ class TFLayoutLMModelTest(TFModelTesterMixin, unittest.TestCase):
model = TFLayoutLMModel.from_pretrained(model_name) model = TFLayoutLMModel.from_pretrained(model_name)
self.assertIsNotNone(model) self.assertIsNotNone(model)
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = copy.deepcopy(inputs_dict)
if return_labels:
if model_class in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
elif model_class in [
*get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
*get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
]:
inputs_dict["labels"] = tf.zeros(
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
)
elif model_class.__name__ == "TFLayoutLMForQuestionAnswering":
inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
return inputs_dict
def prepare_layoutlm_batch_inputs(): def prepare_layoutlm_batch_inputs():
# Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on: # Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on:
......
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, AutoTokenizer, is_vision_available
from transformers.pipelines import pipeline
from transformers.pipelines.document_question_answering import apply_tesseract
from transformers.testing_utils import (
is_pipeline_test,
nested_simplify,
require_detectron2,
require_pytesseract,
require_tf,
require_torch,
require_vision,
slow,
)
from .test_pipelines_common import ANY, PipelineTestCaseMeta
if not is_vision_available():
    # Vision extras are missing: provide inert stand-ins so this module can
    # still be imported (the pipeline tests themselves are gated by
    # @require_vision and will be skipped).

    class Image:
        """Placeholder mirroring the tiny subset of ``PIL.Image`` used here."""

        @staticmethod
        def open(*args, **kwargs):
            pass

    def load_image(_):
        """Stub for ``transformers.image_utils.load_image``; always returns None."""
        return None

else:
    from PIL import Image
    from transformers.image_utils import load_image
# An invoice image pinned to a specific revision of a document question answering space hosted on
# the HuggingFace Hub, so we can rely on it remaining available and unchanged.
INVOICE_URL = (
    "https://huggingface.co/spaces/impira/docquery/resolve/2f6c96314dc84dfda62d40de9da55f2f5165d403/invoice.png"
)
@is_pipeline_test
@require_torch
@require_vision
class DocumentQuestionAnsweringPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
    """Tests for the `document-question-answering` pipeline.

    Covers the generic per-model-mapping harness (via ``PipelineTestCaseMeta``)
    plus targeted tests for LayoutLMv2, LayoutLM, and Donut checkpoints. The
    large-model tests pin exact scores/answers for specific model revisions.
    """

    # Architectures the generic pipeline test harness instantiates tiny models from.
    model_mapping = MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING

    @require_pytesseract
    @require_vision
    def get_test_pipeline(self, model, tokenizer, feature_extractor):
        """Build a pipeline plus example inputs exercising every accepted input shape:
        a pre-loaded image, an image URL, image + precomputed word_boxes, and
        word_boxes alone (image=None)."""
        dqa_pipeline = pipeline(
            "document-question-answering", model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
        )
        image = INVOICE_URL
        # Run OCR once up front so the word_boxes variants don't depend on the
        # pipeline's internal tesseract call.
        word_boxes = list(zip(*apply_tesseract(load_image(image), None, "")))
        question = "What is the placebo?"
        examples = [
            {
                "image": load_image(image),
                "question": question,
            },
            {
                "image": image,
                "question": question,
            },
            {
                "image": image,
                "question": question,
                "word_boxes": word_boxes,
            },
            {
                "image": None,
                "question": question,
                "word_boxes": word_boxes,
            },
        ]
        return dqa_pipeline, examples

    def run_pipeline_test(self, dqa_pipeline, examples):
        """All four input shapes must yield top_k answers of the same schema."""
        outputs = dqa_pipeline(examples, top_k=2)
        self.assertEqual(
            outputs,
            [
                [
                    {"score": ANY(float), "answer": ANY(str), "start": ANY(int), "end": ANY(int)},
                    {"score": ANY(float), "answer": ANY(str), "start": ANY(int), "end": ANY(int)},
                ]
            ]
            * 4,
        )

    @require_torch
    @require_detectron2
    @require_pytesseract
    def test_small_model_pt(self):
        """Tiny LayoutLMv2: checks output schema/values and the empty-OCR edge cases."""
        dqa_pipeline = pipeline("document-question-answering", model="hf-internal-testing/tiny-random-layoutlmv2")
        image = INVOICE_URL
        question = "How many cats are there?"
        # Expected values come from the pinned tiny-random checkpoint; the
        # "answer" text is nonsense by design (random weights).
        expected_output = [
            {
                "score": 0.0001,
                "answer": "2312/2019 DUE DATE 26102/2019 ay DESCRIPTION UNIT PRICE",
                "start": 38,
                "end": 45,
            },
            {"score": 0.0001, "answer": "2312/2019 DUE", "start": 38, "end": 39},
        ]
        outputs = dqa_pipeline(image=image, question=question, top_k=2)
        self.assertEqual(nested_simplify(outputs, decimals=4), expected_output)
        # Same call via the dict form must be equivalent.
        outputs = dqa_pipeline({"image": image, "question": question}, top_k=2)
        self.assertEqual(nested_simplify(outputs, decimals=4), expected_output)
        # This image does not detect ANY text in it, meaning layoutlmv2 should fail.
        # Empty answer probably
        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        outputs = dqa_pipeline(image=image, question=question, top_k=2)
        self.assertEqual(outputs, [])
        # We can optionally pass directly the words and bounding boxes
        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        words = []
        boxes = []
        outputs = dqa_pipeline(image=image, question=question, words=words, boxes=boxes, top_k=2)
        self.assertEqual(outputs, [])

    # TODO: Enable this once hf-internal-testing/tiny-random-donut is implemented
    # @require_torch
    # def test_small_model_pt_donut(self):
    #     dqa_pipeline = pipeline("document-question-answering", model="hf-internal-testing/tiny-random-donut")
    #     # dqa_pipeline = pipeline("document-question-answering", model="../tiny-random-donut")
    #     image = "https://templates.invoicehome.com/invoice-template-us-neat-750px.png"
    #     question = "How many cats are there?"
    #
    #     outputs = dqa_pipeline(image=image, question=question, top_k=2)
    #     self.assertEqual(
    #         nested_simplify(outputs, decimals=4), [{"score": 0.8799, "answer": "2"}, {"score": 0.296, "answer": "1"}]
    #     )

    @slow
    @require_torch
    @require_detectron2
    @require_pytesseract
    def test_large_model_pt(self):
        """Full LayoutLMv2 DocVQA checkpoint (pinned revision): kwargs, dict, and batched inputs."""
        dqa_pipeline = pipeline(
            "document-question-answering",
            model="tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa",
            revision="9977165",
        )
        image = INVOICE_URL
        question = "What is the invoice number?"
        outputs = dqa_pipeline(image=image, question=question, top_k=2)
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                {"score": 0.9966, "answer": "us-001", "start": 15, "end": 15},
                {"score": 0.0009, "answer": "us-001", "start": 15, "end": 15},
            ],
        )
        outputs = dqa_pipeline({"image": image, "question": question}, top_k=2)
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                {"score": 0.9966, "answer": "us-001", "start": 15, "end": 15},
                {"score": 0.0009, "answer": "us-001", "start": 15, "end": 15},
            ],
        )
        # A list of examples returns one result list per example.
        outputs = dqa_pipeline(
            [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2
        )
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                [
                    {"score": 0.9966, "answer": "us-001", "start": 15, "end": 15},
                    {"score": 0.0009, "answer": "us-001", "start": 15, "end": 15},
                ],
            ]
            * 2,
        )

    @slow
    @require_torch
    @require_pytesseract
    @require_vision
    def test_large_model_pt_layoutlm(self):
        """LayoutLM (v1) DocVQA checkpoint; also checks the image=None + word_boxes path."""
        tokenizer = AutoTokenizer.from_pretrained(
            "impira/layoutlm-document-qa", revision="3dc6de3", add_prefix_space=True
        )
        dqa_pipeline = pipeline(
            "document-question-answering",
            model="impira/layoutlm-document-qa",
            tokenizer=tokenizer,
            revision="3dc6de3",
        )
        image = INVOICE_URL
        question = "What is the invoice number?"
        outputs = dqa_pipeline(image=image, question=question, top_k=2)
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                {"score": 0.9998, "answer": "us-001", "start": 15, "end": 15},
                {"score": 0.0, "answer": "INVOICE # us-001", "start": 13, "end": 15},
            ],
        )
        outputs = dqa_pipeline({"image": image, "question": question}, top_k=2)
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                {"score": 0.9998, "answer": "us-001", "start": 15, "end": 15},
                {"score": 0.0, "answer": "INVOICE # us-001", "start": 13, "end": 15},
            ],
        )
        outputs = dqa_pipeline(
            [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2
        )
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                [
                    {"score": 0.9998, "answer": "us-001", "start": 15, "end": 15},
                    {"score": 0.0, "answer": "INVOICE # us-001", "start": 13, "end": 15},
                ]
            ]
            * 2,
        )
        word_boxes = list(zip(*apply_tesseract(load_image(image), None, "")))
        # This model should also work if `image` is set to None
        outputs = dqa_pipeline({"image": None, "word_boxes": word_boxes, "question": question}, top_k=2)
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [
                {"score": 0.9998, "answer": "us-001", "start": 15, "end": 15},
                {"score": 0.0, "answer": "INVOICE # us-001", "start": 13, "end": 15},
            ],
        )

    @slow
    @require_torch
    def test_large_model_pt_donut(self):
        """Donut (generative seq2seq): returns a bare answer dict, no span/score fields."""
        dqa_pipeline = pipeline(
            "document-question-answering",
            model="naver-clova-ix/donut-base-finetuned-docvqa",
            tokenizer=AutoTokenizer.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa"),
            feature_extractor="naver-clova-ix/donut-base-finetuned-docvqa",
        )
        image = INVOICE_URL
        question = "What is the invoice number?"
        outputs = dqa_pipeline(image=image, question=question, top_k=2)
        self.assertEqual(nested_simplify(outputs, decimals=4), {"answer": "us-001"})

    @require_tf
    @unittest.skip("Document question answering not implemented in TF")
    def test_small_model_tf(self):
        pass
...@@ -89,6 +89,7 @@ if is_torch_available(): ...@@ -89,6 +89,7 @@ if is_torch_available():
MODEL_FOR_AUDIO_XVECTOR_MAPPING, MODEL_FOR_AUDIO_XVECTOR_MAPPING,
MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING,
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
MODEL_FOR_MASKED_LM_MAPPING, MODEL_FOR_MASKED_LM_MAPPING,
...@@ -172,7 +173,10 @@ class ModelTesterMixin: ...@@ -172,7 +173,10 @@ class ModelTesterMixin:
if return_labels: if return_labels:
if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device) inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device)
elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): elif model_class in [
*get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING),
*get_values(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING),
]:
inputs_dict["start_positions"] = torch.zeros( inputs_dict["start_positions"] = torch.zeros(
self.model_tester.batch_size, dtype=torch.long, device=torch_device self.model_tester.batch_size, dtype=torch.long, device=torch_device
) )
...@@ -542,7 +546,10 @@ class ModelTesterMixin: ...@@ -542,7 +546,10 @@ class ModelTesterMixin:
if "labels" in inputs_dict: if "labels" in inputs_dict:
correct_outlen += 1 # loss is added to beginning correct_outlen += 1 # loss is added to beginning
# Question Answering model returns start_logits and end_logits # Question Answering model returns start_logits and end_logits
if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): if model_class in [
*get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING),
*get_values(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING),
]:
correct_outlen += 1 # start_logits and end_logits instead of only 1 output correct_outlen += 1 # start_logits and end_logits instead of only 1 output
if "past_key_values" in outputs: if "past_key_values" in outputs:
correct_outlen += 1 # past_key_values have been returned correct_outlen += 1 # past_key_values have been returned
......
...@@ -61,6 +61,7 @@ if is_tf_available(): ...@@ -61,6 +61,7 @@ if is_tf_available():
from transformers import ( from transformers import (
TF_MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
TF_MODEL_FOR_MASKED_LM_MAPPING, TF_MODEL_FOR_MASKED_LM_MAPPING,
...@@ -149,7 +150,10 @@ class TFModelTesterMixin: ...@@ -149,7 +150,10 @@ class TFModelTesterMixin:
if return_labels: if return_labels:
if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32) inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING): elif model_class in [
*get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING),
*get_values(TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING),
]:
inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
elif model_class in [ elif model_class in [
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment