"mmdet3d/models/layers/sparse_block.py" did not exist on "c28a3d98373f99a12760b5c94d084b6204632f08"
Commit f9b1a89a authored by HHL's avatar HHL
Browse files

v

parent 60e27226
# coding=utf-8
import json
import os
import datasets
from layoutlmft.data.utils import load_image, normalize_bbox
# Module-level logger obtained through the `datasets` logging helper.
logger = datasets.logging.get_logger(__name__)
# BibTeX entry for the FUNSD paper; passed as `citation` in Funsd._info().
_CITATION = """\
@article{Jaume2019FUNSDAD,
title={FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents},
author={Guillaume Jaume and H. K. Ekenel and J. Thiran},
journal={2019 International Conference on Document Analysis and Recognition Workshops (ICDARW)},
year={2019},
volume={2},
pages={1-6}
}
"""
# Dataset description (currently just the project URL); passed as
# `description` in Funsd._info().
_DESCRIPTION = """\
https://guillaumejaume.github.io/FUNSD/
"""
class FunsdConfig(datasets.BuilderConfig):
    """Builder configuration for the FUNSD dataset.

    Adds nothing on top of ``datasets.BuilderConfig``: every keyword
    argument is handed to the parent constructor unchanged.
    """

    def __init__(self, **kwargs):
        """Create a FUNSD builder configuration.

        Args:
            **kwargs: keyword arguments forwarded to super.
        """
        super().__init__(**kwargs)
class Funsd(datasets.GeneratorBasedBuilder):
    """FUNSD dataset builder producing word-level BIO tags, boxes and a resized page image."""

    BUILDER_CONFIGS = [
        FunsdConfig(name="funsd", version=datasets.Version("1.0.0"), description="FUNSD dataset"),
    ]

    def _info(self):
        """Describe the features of one example (one annotated form page)."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
                    "ner_tags": datasets.Sequence(
                        datasets.features.ClassLabel(
                            names=["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"]
                        )
                    ),
                    "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"),
                }
            ),
            supervised_keys=None,
            homepage="https://guillaumejaume.github.io/FUNSD/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators.

        The site-local copy of the dataset is used when it exists; otherwise
        the official archive is downloaded and extracted through `dl_manager`
        so the builder also works outside the original cluster.
        """
        # NOTE(review): machine-specific path kept for backward compatibility.
        downloaded_file = '/yrfs1/intern/zrzhang6/DocumentPretrain/dataset/funsd'
        if not os.path.isdir(downloaded_file):
            downloaded_file = dl_manager.download_and_extract("https://guillaumejaume.github.io/FUNSD/dataset.zip")
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"filepath": f"{downloaded_file}/dataset/training_data/"}
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST, gen_kwargs={"filepath": f"{downloaded_file}/dataset/testing_data/"}
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(guid, example)`` pairs, one per annotation file.

        Args:
            filepath: split directory containing an ``annotations`` folder of
                JSON files and an ``images`` folder of same-named PNG files.
        """
        logger.info("⏳ Generating examples from = %s", filepath)
        ann_dir = os.path.join(filepath, "annotations")
        img_dir = os.path.join(filepath, "images")
        # Only JSON files are annotations; stray entries (editor or OS
        # artefacts) would otherwise crash json.load().
        ann_files = sorted(name for name in os.listdir(ann_dir) if name.endswith(".json"))
        for guid, file in enumerate(ann_files):
            tokens = []
            bboxes = []
            ner_tags = []
            file_path = os.path.join(ann_dir, file)
            with open(file_path, "r", encoding="utf8") as f:
                data = json.load(f)
            # Swap only the extension of the file name. A substring replace on
            # the full path would also rewrite "json" inside directory names.
            image_path = os.path.join(img_dir, os.path.splitext(file)[0] + ".png")
            image, size = load_image(image_path)
            for item in data["form"]:
                words, label = item["words"], item["label"]
                # Drop OCR words whose text is empty or whitespace only.
                words = [w for w in words if w["text"].strip() != ""]
                if len(words) == 0:
                    continue
                if label == "other":
                    for w in words:
                        tokens.append(w["text"])
                        ner_tags.append("O")
                        bboxes.append(normalize_bbox(w["box"], size))
                else:
                    # First word of an entity opens it (B-), the rest continue it (I-).
                    tokens.append(words[0]["text"])
                    ner_tags.append("B-" + label.upper())
                    bboxes.append(normalize_bbox(words[0]["box"], size))
                    for w in words[1:]:
                        tokens.append(w["text"])
                        ner_tags.append("I-" + label.upper())
                        bboxes.append(normalize_bbox(w["box"], size))
            yield guid, {"id": str(guid), "tokens": tokens, "bboxes": bboxes, "ner_tags": ner_tags, "image": image}
# Lint as: python3
import json
import logging
import os
import datasets
from layoutlmft.data.utils import load_image, merge_bbox, normalize_bbox, simplify_bbox
from transformers import AutoTokenizer
# Base URL of the XFUN v1.0 release assets; XFUN._split_generators() appends
# "<lang>.<split>.json" and "<lang>.<split>.zip" to it.
_URL = "https://github.com/doc-analysis/XFUN/releases/download/v1.0/"
# Language codes for which a builder config ("xfun.<lang>") is generated.
_LANG = ["zh", "de", "es", "fr", "en", "it", "ja", "pt"]
logger = logging.getLogger(__name__)
class XFUNConfig(datasets.BuilderConfig):
    """Builder configuration for one XFUN language."""

    def __init__(self, lang, additional_langs=None, **kwargs):
        """Store the language selection and forward everything else.

        Args:
            lang: string, language for the input text
            additional_langs: optional extra training languages, stored
                unchanged. XFUN._split_generators() splits the value on "+"
                and treats "all" as every other language.
            **kwargs: keyword arguments forwarded to super.
        """
        super().__init__(**kwargs)
        self.additional_langs = additional_langs
        self.lang = lang
class XFUN(datasets.GeneratorBasedBuilder):
    """XFUN dataset builder: tokenised lines with boxes, BIO labels, entities and key-value relations."""

    BUILDER_CONFIGS = [XFUNConfig(name=f"xfun.{lang}", lang=lang) for lang in _LANG]
    # NOTE: the tokenizer is loaded once, when the class body is executed.
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

    def _info(self):
        """Describe the features of one example (one chunk of up to 512 tokens)."""
        return datasets.DatasetInfo(
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "input_ids": datasets.Sequence(datasets.Value("int64")),
                    "bbox": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
                    "labels": datasets.Sequence(
                        datasets.ClassLabel(
                            names=["O", "B-QUESTION", "B-ANSWER", "B-HEADER", "I-ANSWER", "I-QUESTION", "I-HEADER"]
                        )
                    ),
                    "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"),
                    "entities": datasets.Sequence(
                        {
                            "start": datasets.Value("int64"),
                            "end": datasets.Value("int64"),
                            "label": datasets.ClassLabel(names=["HEADER", "QUESTION", "ANSWER"]),
                        }
                    ),
                    "relations": datasets.Sequence(
                        {
                            "head": datasets.Value("int64"),
                            "tail": datasets.Value("int64"),
                            "start_index": datasets.Value("int64"),
                            "end_index": datasets.Value("int64"),
                        }
                    ),
                }
            ),
            supervised_keys=None,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators.

        Train and validation files are fetched for the configured language.
        When `additional_langs` is set, the train files of those languages
        are appended to the training split only.
        """
        urls_to_download = {
            "train": [f"{_URL}{self.config.lang}.train.json", f"{_URL}{self.config.lang}.train.zip"],
            "val": [f"{_URL}{self.config.lang}.val.json", f"{_URL}{self.config.lang}.val.zip"],
            # "test": [f"{_URL}{self.config.lang}.test.json", f"{_URL}{self.config.lang}.test.zip"],
        }
        downloaded_files = dl_manager.download_and_extract(urls_to_download)
        # Each entry is a [json_path, extracted_image_dir] pair.
        train_files_for_many_langs = [downloaded_files["train"]]
        val_files_for_many_langs = [downloaded_files["val"]]
        # test_files_for_many_langs = [downloaded_files["test"]]
        if self.config.additional_langs:
            additional_langs = self.config.additional_langs.split("+")
            if "all" in additional_langs:
                additional_langs = [lang for lang in _LANG if lang != self.config.lang]
            for lang in additional_langs:
                urls_to_download = {"train": [f"{_URL}{lang}.train.json", f"{_URL}{lang}.train.zip"]}
                additional_downloaded_files = dl_manager.download_and_extract(urls_to_download)
                train_files_for_many_langs.append(additional_downloaded_files["train"])

        logger.info(f"Training on {self.config.lang} with additional langs({self.config.additional_langs})")
        logger.info(f"Evaluating on {self.config.lang}")
        logger.info(f"Testing on {self.config.lang}")
        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": train_files_for_many_langs}),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION, gen_kwargs={"filepaths": val_files_for_many_langs}
            ),
            # datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepaths": test_files_for_many_langs}),
        ]

    def _generate_examples(self, filepaths):
        """Yield ``(key, example)`` pairs, one per 512-token chunk of each document.

        Args:
            filepaths: list of ``[annotation_json_path, image_dir]`` pairs.
        """
        for filepath in filepaths:
            logger.info("Generating examples from = %s", filepath)
            # The annotations are multilingual JSON; decode explicitly as
            # UTF-8 instead of relying on the platform default encoding.
            with open(filepath[0], "r", encoding="utf-8") as f:
                data = json.load(f)

            for doc in data["documents"]:
                doc["img"]["fpath"] = os.path.join(filepath[1], doc["img"]["fname"])
                image, size = load_image(doc["img"]["fpath"])
                document = doc["document"]
                tokenized_doc = {"input_ids": [], "bbox": [], "labels": []}
                entities = []
                relations = []
                id2label = {}
                entity_id_to_index_map = {}
                empty_entity = set()
                for line in document:
                    if len(line["text"]) == 0:
                        # Remember empty lines so relations pointing at them can be dropped.
                        empty_entity.add(line["id"])
                        continue
                    id2label[line["id"]] = line["label"]
                    relations.extend([tuple(sorted(l)) for l in line["linking"]])
                    tokenized_inputs = self.tokenizer(
                        line["text"],
                        add_special_tokens=False,
                        return_offsets_mapping=True,
                        return_attention_mask=False,
                    )
                    # Walk tokens and OCR words in parallel, consuming OCR words
                    # until their normalised length covers the current token.
                    text_length = 0
                    ocr_length = 0
                    bbox = []
                    for token_id, offset in zip(tokenized_inputs["input_ids"], tokenized_inputs["offset_mapping"]):
                        # NOTE(review): 6 is presumably the id of the bare "▁"
                        # piece in xlm-roberta-base - confirm. Its box is
                        # filled in from the following token below.
                        if token_id == 6:
                            bbox.append(None)
                            continue
                        text_length += offset[1] - offset[0]
                        tmp_box = []
                        while ocr_length < text_length:
                            ocr_word = line["words"].pop(0)
                            ocr_length += len(
                                self.tokenizer._tokenizer.normalizer.normalize_str(ocr_word["text"].strip())
                            )
                            tmp_box.append(simplify_bbox(ocr_word["box"]))
                        if len(tmp_box) == 0:
                            # Token lies inside an OCR word that was already
                            # consumed: reuse the previous token's boxes.
                            tmp_box = last_box
                        bbox.append(normalize_bbox(merge_bbox(tmp_box), size))
                        last_box = tmp_box  # noqa
                    # Placeholder tokens get a zero-area box at the top-left
                    # corner of the next token's box.
                    bbox = [
                        [bbox[i + 1][0], bbox[i + 1][1], bbox[i + 1][0], bbox[i + 1][1]] if b is None else b
                        for i, b in enumerate(bbox)
                    ]
                    if line["label"] == "other":
                        label = ["O"] * len(bbox)
                    else:
                        label = [f"I-{line['label'].upper()}"] * len(bbox)
                        label[0] = f"B-{line['label'].upper()}"
                    tokenized_inputs.update({"bbox": bbox, "labels": label})
                    if label[0] != "O":
                        entity_id_to_index_map[line["id"]] = len(entities)
                        entities.append(
                            {
                                "start": len(tokenized_doc["input_ids"]),
                                "end": len(tokenized_doc["input_ids"]) + len(tokenized_inputs["input_ids"]),
                                "label": line["label"].upper(),
                            }
                        )
                    for i in tokenized_doc:
                        tokenized_doc[i] = tokenized_doc[i] + tokenized_inputs[i]

                relations = list(set(relations))
                relations = [rel for rel in relations if rel[0] not in empty_entity and rel[1] not in empty_entity]
                # Keep only question/answer links, oriented question -> answer.
                kvrelations = []
                for rel in relations:
                    pair = [id2label[rel[0]], id2label[rel[1]]]
                    if pair == ["question", "answer"]:
                        kvrelations.append(
                            {"head": entity_id_to_index_map[rel[0]], "tail": entity_id_to_index_map[rel[1]]}
                        )
                    elif pair == ["answer", "question"]:
                        kvrelations.append(
                            {"head": entity_id_to_index_map[rel[1]], "tail": entity_id_to_index_map[rel[0]]}
                        )
                    else:
                        continue

                def get_relation_span(rel):
                    # Smallest token range covering both linked entities.
                    bound = []
                    for entity_index in [rel["head"], rel["tail"]]:
                        bound.append(entities[entity_index]["start"])
                        bound.append(entities[entity_index]["end"])
                    return min(bound), max(bound)

                spanned_relations = []
                for rel in kvrelations:
                    span_start, span_end = get_relation_span(rel)
                    spanned_relations.append(
                        {
                            "head": rel["head"],
                            "tail": rel["tail"],
                            "start_index": span_start,
                            "end_index": span_end,
                        }
                    )
                relations = sorted(spanned_relations, key=lambda x: x["head"])

                chunk_size = 512
                for chunk_id, index in enumerate(range(0, len(tokenized_doc["input_ids"]), chunk_size)):
                    item = {}
                    for k in tokenized_doc:
                        item[k] = tokenized_doc[k][index : index + chunk_size]
                    entities_in_this_span = []
                    global_to_local_map = {}
                    for entity_id, entity in enumerate(entities):
                        if (
                            index <= entity["start"] < index + chunk_size
                            and index <= entity["end"] < index + chunk_size
                        ):
                            global_to_local_map[entity_id] = len(entities_in_this_span)
                            # Emit a shifted copy so the document-level entity
                            # list keeps its global offsets.
                            entities_in_this_span.append(
                                {
                                    "start": entity["start"] - index,
                                    "end": entity["end"] - index,
                                    "label": entity["label"],
                                }
                            )
                    relations_in_this_span = []
                    for relation in relations:
                        if (
                            index <= relation["start_index"] < index + chunk_size
                            and index <= relation["end_index"] < index + chunk_size
                        ):
                            relations_in_this_span.append(
                                {
                                    "head": global_to_local_map[relation["head"]],
                                    "tail": global_to_local_map[relation["tail"]],
                                    "start_index": relation["start_index"] - index,
                                    "end_index": relation["end_index"] - index,
                                }
                            )
                    item.update(
                        {
                            "id": f"{doc['id']}_{chunk_id}",
                            "image": image,
                            "entities": entities_in_this_span,
                            "relations": relations_in_this_span,
                        }
                    )
                    yield f"{doc['id']}_{chunk_id}", item
import torch
from detectron2.data.detection_utils import read_image
from detectron2.data.transforms import ResizeTransform, TransformList
def normalize_bbox(bbox, size):
    """Rescale a box to a 0-1000 grid, truncating each value to int.

    Args:
        bbox: sequence whose first four items are ``x0, y0, x1, y1``.
        size: sequence whose first two items divide the x and y values
            respectively (``load_image`` returns it as ``(w, h)``).

    Returns:
        List of four ints: ``int(1000 * coordinate / matching size entry)``.
    """
    # Even positions are x values (divided by size[0]), odd positions are y
    # values (divided by size[1]).
    return [int(1000 * bbox[pos] / size[pos % 2]) for pos in range(4)]
def simplify_bbox(bbox):
    """Return the axis-aligned bounding box of a flat coordinate list.

    Args:
        bbox: flat sequence ``[x0, y0, x1, y1, ...]`` holding two or more
            points, for example a two-corner box or a four-point polygon.

    Returns:
        ``[min_x, min_y, max_x, max_y]`` over all points.

    The maxima are taken over every point, including the first one. Skipping
    the first point gives a wrong box whenever that point holds the largest
    x or y, as with a reversed box or a polygon starting at its right or
    bottom corner. Well-formed ``[x0, y0, x1, y1]`` boxes with ``x0 <= x1``
    and ``y0 <= y1`` are returned unchanged.
    """
    xs = bbox[0::2]
    ys = bbox[1::2]
    return [min(xs), min(ys), max(xs), max(ys)]
def merge_bbox(bbox_list):
    """Return the smallest box enclosing every box in `bbox_list`.

    Args:
        bbox_list: non-empty iterable of 4-item boxes ``(x0, y0, x1, y1)``.

    Returns:
        ``[min x0, min y0, max x1, max y1]``.
    """
    # Transpose the boxes into one column per coordinate.
    lefts, tops, rights, bottoms = tuple(zip(*bbox_list))
    merged = [min(lefts), min(tops)]
    merged.append(max(rights))
    merged.append(max(bottoms))
    return merged
def load_image(image_path):
    """Read an image, resize it to 224x224 and return it channel-first.

    Args:
        image_path: path handed to detectron2's ``read_image``.

    Returns:
        Tuple ``(image, (w, h))``: the resized image as a torch tensor permuted
        with ``(2, 0, 1)``, and the width and height of the image as read.
    """
    original = read_image(image_path, format="BGR")
    height, width = original.shape[0], original.shape[1]
    resize = TransformList([ResizeTransform(h=height, w=width, new_h=224, new_w=224)])
    resized = resize.apply_image(original).copy()  # copy to make it writeable
    return torch.tensor(resized).permute(2, 0, 1), (width, height)
import os
import re
import numpy as np
from transformers.utils import logging
logger = logging.get_logger(__name__)
# Directory-name prefix that marks a checkpoint folder.
PREFIX_CHECKPOINT_DIR = "checkpoint"
# Matches names of the exact form "checkpoint-<digits>" and captures the digits.
_re_checkpoint = re.compile(r"^" + PREFIX_CHECKPOINT_DIR + r"\-(\d+)$")
def get_last_checkpoint(folder):
    """Return the path of the highest-numbered checkpoint directory in `folder`.

    Only sub-directories whose name matches ``_re_checkpoint`` are considered.

    Args:
        folder: directory to scan.

    Returns:
        ``os.path.join(folder, name)`` for the entry with the largest numeric
        suffix, or ``None`` when no checkpoint directory exists.
    """
    best_name = None
    best_step = None
    for entry in os.listdir(folder):
        found = _re_checkpoint.search(entry)
        if found is None or not os.path.isdir(os.path.join(folder, entry)):
            continue
        step = int(found.groups()[0])
        # Strict comparison keeps the first entry on ties.
        if best_step is None or step > best_step:
            best_name, best_step = entry, step
    if best_name is None:
        return
    return os.path.join(folder, best_name)
def re_score(pred_relations, gt_relations, mode="strict"):
    """Score predicted relations against ground truth and log a summary.

    Args:
        pred_relations (list): one list of predicted relations per sentence.
        gt_relations (list): one list of ground-truth relations per sentence.
            Each relation is a dict of the form
                rel = { "head": (start_idx (inclusive), end_idx (exclusive)),
                        "tail": (start_idx (inclusive), end_idx (exclusive)),
                        "head_type": ent_type,
                        "tail_type": ent_type,
                        "type": rel_type}
        mode (str): "strict" compares head, tail and both entity types;
            "boundaries" compares head and tail only.

    Returns:
        dict mapping each relation type and "ALL" to its tp/fp/fn counts and
        p/r/f1 values. The "ALL" entry holds the micro scores plus
        Macro_f1 / Macro_p / Macro_r.
    """
    assert mode in ["strict", "boundaries"]

    # Candidate types are 0 and 1; type 0 is left out of scoring.
    relation_types = [candidate for candidate in [0, 1] if candidate != 0]
    scores = {key: {"tp": 0, "fp": 0, "fn": 0} for key in relation_types + ["ALL"]}

    # Totals used only in the log summary.
    n_sents = len(gt_relations)
    n_rels = sum(sum(1 for _ in sent) for sent in gt_relations)
    n_found = sum(sum(1 for _ in sent) for sent in pred_relations)

    # Accumulate TP / FP / FN per relation type, sentence by sentence.
    for pred_sent, gt_sent in zip(pred_relations, gt_relations):
        for rel_type in relation_types:
            if mode == "strict":
                # Strict mode also compares the argument entity types.
                predicted = set()
                for rel in pred_sent:
                    if rel["type"] == rel_type:
                        predicted.add((rel["head"], rel["head_type"], rel["tail"], rel["tail_type"]))
                expected = set()
                for rel in gt_sent:
                    if rel["type"] == rel_type:
                        expected.add((rel["head"], rel["head_type"], rel["tail"], rel["tail_type"]))
            elif mode == "boundaries":
                # Boundaries mode compares argument spans only.
                predicted = set()
                for rel in pred_sent:
                    if rel["type"] == rel_type:
                        predicted.add((rel["head"], rel["tail"]))
                expected = set()
                for rel in gt_sent:
                    if rel["type"] == rel_type:
                        expected.add((rel["head"], rel["tail"]))

            counts = scores[rel_type]
            counts["tp"] += len(predicted & expected)
            counts["fp"] += len(predicted - expected)
            counts["fn"] += len(expected - predicted)

    # Precision / recall / F1 for every entry, including the "ALL" placeholder.
    for entry in scores.values():
        hits = entry["tp"]
        if hits:
            entry["p"] = hits / (entry["fp"] + hits)
            entry["r"] = hits / (entry["fn"] + hits)
        else:
            entry["p"], entry["r"] = 0, 0

        if entry["p"] + entry["r"] == 0:
            entry["f1"] = 0
        else:
            entry["f1"] = 2 * entry["p"] * entry["r"] / (entry["p"] + entry["r"])

    # Micro-averaged scores over all scored relation types.
    tp = sum(scores[rel_type]["tp"] for rel_type in relation_types)
    fp = sum(scores[rel_type]["fp"] for rel_type in relation_types)
    fn = sum(scores[rel_type]["fn"] for rel_type in relation_types)

    if tp:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * precision * recall / (precision + recall)
    else:
        precision, recall, f1 = 0, 0, 0

    overall = scores["ALL"]
    overall["p"] = precision
    overall["r"] = recall
    overall["f1"] = f1
    overall["tp"] = tp
    overall["fp"] = fp
    overall["fn"] = fn

    # Macro-averaged scores: plain mean of the per-type values.
    overall["Macro_f1"] = np.mean([scores[ent_type]["f1"] for ent_type in relation_types])
    overall["Macro_p"] = np.mean([scores[ent_type]["p"] for ent_type in relation_types])
    overall["Macro_r"] = np.mean([scores[ent_type]["r"] for ent_type in relation_types])

    logger.info(f"RE Evaluation in *** {mode.upper()} *** mode")

    logger.info(
        "processed {} sentences with {} relations; found: {} relations; correct: {}.".format(
            n_sents, n_rels, n_found, tp
        )
    )
    logger.info(
        "\tALL\t TP: {};\tFP: {};\tFN: {}".format(overall["tp"], overall["fp"], overall["fn"])
    )
    logger.info("\t\t(m avg): precision: {:.2f};\trecall: {:.2f};\tf1: {:.2f} (micro)".format(precision, recall, f1))
    logger.info(
        "\t\t(M avg): precision: {:.2f};\trecall: {:.2f};\tf1: {:.2f} (Macro)\n".format(
            overall["Macro_p"], overall["Macro_r"], overall["Macro_f1"]
        )
    )

    for rel_type in relation_types:
        per_type = scores[rel_type]
        logger.info(
            "\t{}: \tTP: {};\tFP: {};\tFN: {};\tprecision: {:.2f};\trecall: {:.2f};\tf1: {:.2f};\t{}".format(
                rel_type,
                per_type["tp"],
                per_type["fp"],
                per_type["fn"],
                per_type["p"],
                per_type["r"],
                per_type["f1"],
                per_type["tp"] + per_type["fp"],
            )
        )

    return scores
from .configuration_graphdoc import GraphDocConfig
from .modeling_graphdoc import GraphDocForTokenClassification, GraphDocModel, GraphDocForPretrain
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment