Commit f9b1a89a authored by HHL

v

parent 60e27226
.vscode/*
pretrained_model/*
# GraphDoc
The source code for [Multimodal Pre-training Based on Graph Attention Network for Document Understanding](https://arxiv.org/abs/2203.13530).
## Requirements
* torch==1.7.1
* mmdet==2.16.0
* transformers==4.6.0
## Pretrained-Model
We provide the pretrained model required for the downstream tasks; the download link is https://rec.ustc.edu.cn/share/031c0580-0366-11ed-bb15-47281881a56b.
Unzip the pretrained model into the base folder so that the layout is graphdoc/pretrained_model.
## Usage
We provide example code for extracting document representations with GraphDoc in runner/graphdoc/encode_document.py.
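For orientation, the sketch below shows one way the pretrained checkpoint might be loaded. It is an assumption, not the repository's verified entry point: the import path presumes the package is importable as `layoutlmft` (matching the module layout in this commit), and only the classes actually registered in the package `__init__.py` are used. `runner/graphdoc/encode_document.py` remains the authoritative example.

```python
# A sketch only: the import path and loading details are assumptions based on
# the classes registered in this commit, not the verified extraction script.
from layoutlmft.models.graphdoc import GraphDocConfig, GraphDocForTokenClassification

config = GraphDocConfig.from_pretrained("pretrained_model")
model = GraphDocForTokenClassification.from_pretrained("pretrained_model", config=config)
model.eval()  # see runner/graphdoc/encode_document.py for full feature extraction
```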
## Citation
If you find GraphDoc useful in your research, please consider citing:

    @article{zrzhang2022graphdoc,
      author  = {Zhang, Zhenrong and Ma, Jiefeng and Du, Jun and Wang, Licheng and Zhang, Jianshu},
      title   = {Multimodal Pre-training Based on Graph Attention Network for Document Understanding},
      journal = {arXiv},
      year    = {2022},
      volume  = {abs/2203.13530}
    }
## Contact
zzr666@mail.ustc.edu.cn
from collections import OrderedDict
from transformers import CONFIG_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_NAMES_MAPPING, TOKENIZER_MAPPING
from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, BertConverter, XLMRobertaConverter
from transformers.models.auto.modeling_auto import auto_class_factory
from .models.layoutlmv2 import (
    LayoutLMv2Config,
    LayoutLMv2ForRelationExtraction,
    LayoutLMv2ForTokenClassification,
    LayoutLMv2Tokenizer,
    LayoutLMv2TokenizerFast,
)
from .models.layoutxlm import (
    LayoutXLMConfig,
    LayoutXLMForRelationExtraction,
    LayoutXLMForTokenClassification,
    LayoutXLMTokenizer,
    LayoutXLMTokenizerFast,
)
from .models.graphdoc import (
    GraphDocConfig,
    GraphDocForTokenClassification,
)
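# Register the custom document models with transformers' auto-class registries
# so that the auto config, tokenizer, and model classes can resolve them.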
CONFIG_MAPPING.update([("layoutlmv2", LayoutLMv2Config), ("layoutxlm", LayoutXLMConfig), ("graphdoc", GraphDocConfig)])
MODEL_NAMES_MAPPING.update([("layoutlmv2", "LayoutLMv2"), ("layoutxlm", "LayoutXLM"), ("graphdoc", "GraphDoc")])
TOKENIZER_MAPPING.update(
    [
        (LayoutLMv2Config, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast)),
        (LayoutXLMConfig, (LayoutXLMTokenizer, LayoutXLMTokenizerFast)),
    ]
)
SLOW_TO_FAST_CONVERTERS.update(
    {
        # Keys must be slow-tokenizer class names; "LayoutXLMConverter" was a typo.
        "LayoutLMv2Tokenizer": BertConverter,
        "LayoutXLMTokenizer": XLMRobertaConverter,
    }
)
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.update(
    [
        (LayoutLMv2Config, LayoutLMv2ForTokenClassification),
        (LayoutXLMConfig, LayoutXLMForTokenClassification),
        (GraphDocConfig, GraphDocForTokenClassification),
    ]
)
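# transformers ships no auto mapping for relation extraction, so define one here
# in the same (config class -> model class) style as the mapping above.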
MODEL_FOR_RELATION_EXTRACTION_MAPPING = OrderedDict(
    [
        (LayoutLMv2Config, LayoutLMv2ForRelationExtraction),
        (LayoutXLMConfig, LayoutXLMForRelationExtraction),
    ]
)
AutoModelForTokenClassification = auto_class_factory(
    "AutoModelForTokenClassification", MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, head_doc="token classification"
)
AutoModelForRelationExtraction = auto_class_factory(
    "AutoModelForRelationExtraction", MODEL_FOR_RELATION_EXTRACTION_MAPPING, head_doc="relation extraction"
)
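# Example (a sketch, not part of this commit): with the registrations above, the
# generated auto classes dispatch on the checkpoint's config type, e.g.
#
#   model = AutoModelForTokenClassification.from_pretrained("pretrained_model")
#
# resolves a GraphDocConfig checkpoint to GraphDocForTokenClassification.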
# flake8: noqa
from .data_collator import DataCollatorForKeyValueExtraction
from .datasets import *
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class DataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
"""
task_name: Optional[str] = field(default="ner", metadata={"help": "The name of the task (ner, pos...)."})
dataset_name: Optional[str] = field(
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
)
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
train_file: Optional[str] = field(
default=None, metadata={"help": "The input training data file (a csv or JSON file)."}
)
validation_file: Optional[str] = field(
default=None,
metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."},
)
test_file: Optional[str] = field(
default=None,
metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
default=None,
metadata={"help": "The number of processes to use for the preprocessing."},
)
pad_to_max_length: bool = field(
default=True,
metadata={
"help": "Whether to pad all samples to model maximum sentence length. "
"If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
"efficient on GPU but very bad for TPU."
},
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_val_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
"value if set."
},
)
max_test_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of test examples to this "
"value if set."
},
)
label_all_tokens: bool = field(
default=False,
metadata={
"help": "Whether to put the label for one word on all tokens of generated by that word or just on the "
"one (in which case the other tokens will have a padding index)."
},
)
return_entity_level_metrics: bool = field(
default=False,
metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
)
@dataclass
class XFUNDataTrainingArguments(DataTrainingArguments):
lang: Optional[str] = field(default="en")
additional_langs: Optional[str] = field(default=None)
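# Example usage (a sketch, not part of this commit): these dataclasses are
# designed to be parsed with transformers.HfArgumentParser, e.g.
#
#   from transformers import HfArgumentParser, TrainingArguments
#
#   parser = HfArgumentParser((XFUNDataTrainingArguments, TrainingArguments))
#   data_args, training_args = parser.parse_args_into_dataclasses()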
from dataclasses import dataclass
from typing import Optional, Union
import torch
from detectron2.structures import ImageList
from transformers import PreTrainedTokenizerBase
from transformers.file_utils import PaddingStrategy
@dataclass
class DataCollatorForKeyValueExtraction:
"""
Data collator that will dynamically pad the inputs received, as well as the labels.
Args:
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
The tokenizer used for encoding the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (:obj:`int`, `optional`):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions).
"""
tokenizer: PreTrainedTokenizerBase
padding: Union[bool, str, PaddingStrategy] = True
max_length: Optional[int] = None
pad_to_multiple_of: Optional[int] = None
label_pad_token_id: int = -100
def __call__(self, features):
label_name = "label" if "label" in features[0].keys() else "labels"
labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
has_image_input = "image" in features[0]
has_bbox_input = "bbox" in features[0]
if has_image_input:
image = ImageList.from_tensors([torch.tensor(feature["image"]) for feature in features], 32)
for feature in features:
del feature["image"]
batch = self.tokenizer.pad(
features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
# Conversion to tensors will fail if we have labels as they are not of the same length yet.
return_tensors="pt" if labels is None else None,
)
if labels is None:
return batch
sequence_length = torch.tensor(batch["input_ids"]).shape[1]
padding_side = self.tokenizer.padding_side
if padding_side == "right":
batch["labels"] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels]
if has_bbox_input:
batch["bbox"] = [bbox + [[0, 0, 0, 0]] * (sequence_length - len(bbox)) for bbox in batch["bbox"]]
else:
batch["labels"] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels]
if has_bbox_input:
batch["bbox"] = [[[0, 0, 0, 0]] * (sequence_length - len(bbox)) + bbox for bbox in batch["bbox"]]
batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) else v for k, v in batch.items()}
if has_image_input:
batch["image"] = image
return batch
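# Example usage (a sketch; `tokenizer` and `train_dataset` are placeholders):
#
#   from torch.utils.data import DataLoader
#
#   collator = DataCollatorForKeyValueExtraction(tokenizer, padding="max_length", max_length=512)
#   loader = DataLoader(train_dataset, batch_size=2, collate_fn=collator)
#   batch = next(iter(loader))  # dict with input_ids, bbox, labels, and (optionally) image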