Commit f9b1a89a authored by HHL

v

parent 60e27226
.vscode/*
pretrained_model/*
# GraphDoc
The source code for [Multimodal Pre-training Based on Graph Attention Network for Document Understanding](https://arxiv.org/abs/2203.13530).
## Requirements
* torch==1.7.1
* mmdet==2.16.0
* transformers==4.6.0
## Pretrained-Model
We provide the pretrained model required for the downstream tasks; the download link is https://rec.ustc.edu.cn/share/031c0580-0366-11ed-bb15-47281881a56b.
Unzip the pretrained model into the base folder so that the layout is graphdoc/pretrained_model.
## Usage
We provide example code for extracting document representations with GraphDoc in runner/graphdoc/encode_document.py.
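For orientation, the sketch below shows one way the pretrained checkpoint might be loaded. It is an assumption, not the repository's verified entry point: the import path presumes the package is importable as `layoutlmft` (matching the module layout in this commit), and only the classes actually registered in the package `__init__.py` are used. `runner/graphdoc/encode_document.py` remains the authoritative example.

```python
# A sketch only: the import path and loading details are assumptions based on
# the classes registered in this commit, not the verified extraction script.
from layoutlmft.models.graphdoc import GraphDocConfig, GraphDocForTokenClassification

config = GraphDocConfig.from_pretrained("pretrained_model")
model = GraphDocForTokenClassification.from_pretrained("pretrained_model", config=config)
model.eval()  # see runner/graphdoc/encode_document.py for full feature extraction
```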
## Citation
If you find GraphDoc useful in your research, please consider citing:

    @article{zrzhang2022graphdoc,
      author  = {Zhang, Zhenrong and Ma, Jiefeng and Du, Jun and Wang, Licheng and Zhang, Jianshu},
      title   = {Multimodal Pre-training Based on Graph Attention Network for Document Understanding},
      journal = {arXiv},
      year    = {2022},
      volume  = {abs/2203.13530}
    }
## Contact
zzr666@mail.ustc.edu.cn
from collections import OrderedDict
from transformers import CONFIG_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_NAMES_MAPPING, TOKENIZER_MAPPING
from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, BertConverter, XLMRobertaConverter
from transformers.models.auto.modeling_auto import auto_class_factory
from .models.layoutlmv2 import (
    LayoutLMv2Config,
    LayoutLMv2ForRelationExtraction,
    LayoutLMv2ForTokenClassification,
    LayoutLMv2Tokenizer,
    LayoutLMv2TokenizerFast,
)
from .models.layoutxlm import (
    LayoutXLMConfig,
    LayoutXLMForRelationExtraction,
    LayoutXLMForTokenClassification,
    LayoutXLMTokenizer,
    LayoutXLMTokenizerFast,
)
from .models.graphdoc import (
    GraphDocConfig,
    GraphDocForTokenClassification,
)
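# Register the custom document models with transformers' auto-class registries
# so that the auto config, tokenizer, and model classes can resolve them.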
CONFIG_MAPPING.update([("layoutlmv2", LayoutLMv2Config), ("layoutxlm", LayoutXLMConfig), ("graphdoc", GraphDocConfig)])
MODEL_NAMES_MAPPING.update([("layoutlmv2", "LayoutLMv2"), ("layoutxlm", "LayoutXLM"), ("graphdoc", "GraphDoc")])
TOKENIZER_MAPPING.update(
    [
        (LayoutLMv2Config, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast)),
        (LayoutXLMConfig, (LayoutXLMTokenizer, LayoutXLMTokenizerFast)),
    ]
)
SLOW_TO_FAST_CONVERTERS.update(
    {
        # Keys must be slow-tokenizer class names; "LayoutXLMConverter" was a typo.
        "LayoutLMv2Tokenizer": BertConverter,
        "LayoutXLMTokenizer": XLMRobertaConverter,
    }
)
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.update(
    [
        (LayoutLMv2Config, LayoutLMv2ForTokenClassification),
        (LayoutXLMConfig, LayoutXLMForTokenClassification),
        (GraphDocConfig, GraphDocForTokenClassification),
    ]
)
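# transformers ships no auto mapping for relation extraction, so define one here
# in the same (config class -> model class) style as the mapping above.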
MODEL_FOR_RELATION_EXTRACTION_MAPPING = OrderedDict(
    [
        (LayoutLMv2Config, LayoutLMv2ForRelationExtraction),
        (LayoutXLMConfig, LayoutXLMForRelationExtraction),
    ]
)
AutoModelForTokenClassification = auto_class_factory(
    "AutoModelForTokenClassification", MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, head_doc="token classification"
)
AutoModelForRelationExtraction = auto_class_factory(
    "AutoModelForRelationExtraction", MODEL_FOR_RELATION_EXTRACTION_MAPPING, head_doc="relation extraction"
)
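# Example (a sketch, not part of this commit): with the registrations above, the
# generated auto classes dispatch on the checkpoint's config type, e.g.
#
#   model = AutoModelForTokenClassification.from_pretrained("pretrained_model")
#
# resolves a GraphDocConfig checkpoint to GraphDocForTokenClassification.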
# flake8: noqa
from .data_collator import DataCollatorForKeyValueExtraction
from .datasets import *
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class DataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
"""
task_name: Optional[str] = field(default="ner", metadata={"help": "The name of the task (ner, pos...)."})
dataset_name: Optional[str] = field(
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
)
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
train_file: Optional[str] = field(
default=None, metadata={"help": "The input training data file (a csv or JSON file)."}
)
validation_file: Optional[str] = field(
default=None,
metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."},
)
test_file: Optional[str] = field(
default=None,
metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
default=None,
metadata={"help": "The number of processes to use for the preprocessing."},
)
pad_to_max_length: bool = field(
default=True,
metadata={
"help": "Whether to pad all samples to model maximum sentence length. "
"If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
"efficient on GPU but very bad for TPU."
},
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_val_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
"value if set."
},
)
max_test_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of test examples to this "
"value if set."
},
)
label_all_tokens: bool = field(
default=False,
metadata={
"help": "Whether to put the label for one word on all tokens of generated by that word or just on the "
"one (in which case the other tokens will have a padding index)."
},
)
return_entity_level_metrics: bool = field(
default=False,
metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
)
@dataclass
class XFUNDataTrainingArguments(DataTrainingArguments):
lang: Optional[str] = field(default="en")
additional_langs: Optional[str] = field(default=None)
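# Example usage (a sketch, not part of this commit): these dataclasses are
# designed to be parsed with transformers.HfArgumentParser, e.g.
#
#   from transformers import HfArgumentParser, TrainingArguments
#
#   parser = HfArgumentParser((XFUNDataTrainingArguments, TrainingArguments))
#   data_args, training_args = parser.parse_args_into_dataclasses()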
from dataclasses import dataclass
from typing import Optional, Union
import torch
from detectron2.structures import ImageList
from transformers import PreTrainedTokenizerBase
from transformers.file_utils import PaddingStrategy
@dataclass
class DataCollatorForKeyValueExtraction:
"""
Data collator that will dynamically pad the inputs received, as well as the labels.
Args:
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
The tokenizer used for encoding the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (:obj:`int`, `optional`):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions).
"""
tokenizer: PreTrainedTokenizerBase
padding: Union[bool, str, PaddingStrategy] = True
max_length: Optional[int] = None
pad_to_multiple_of: Optional[int] = None
label_pad_token_id: int = -100
def __call__(self, features):
label_name = "label" if "label" in features[0].keys() else "labels"
labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
has_image_input = "image" in features[0]
has_bbox_input = "bbox" in features[0]
if has_image_input:
image = ImageList.from_tensors([torch.tensor(feature["image"]) for feature in features], 32)
for feature in features:
del feature["image"]
batch = self.tokenizer.pad(
features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
# Conversion to tensors will fail if we have labels as they are not of the same length yet.
return_tensors="pt" if labels is None else None,
)
if labels is None:
return batch
sequence_length = torch.tensor(batch["input_ids"]).shape[1]
padding_side = self.tokenizer.padding_side
if padding_side == "right":
batch["labels"] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels]
if has_bbox_input:
batch["bbox"] = [bbox + [[0, 0, 0, 0]] * (sequence_length - len(bbox)) for bbox in batch["bbox"]]
else:
batch["labels"] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels]
if has_bbox_input:
batch["bbox"] = [[[0, 0, 0, 0]] * (sequence_length - len(bbox)) + bbox for bbox in batch["bbox"]]
batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) else v for k, v in batch.items()}
if has_image_input:
batch["image"] = image
return batch
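# Example usage (a sketch; `tokenizer` and `train_dataset` are placeholders):
#
#   from torch.utils.data import DataLoader
#
#   collator = DataCollatorForKeyValueExtraction(tokenizer, padding="max_length", max_length=512)
#   loader = DataLoader(train_dataset, batch_size=2, collate_fn=collator)
#   batch = next(iter(loader))  # dict with input_ids, bbox, labels, and (optionally) image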