Commit d01acab4 authored by 徐超
Browse files

Merge branch 'master' of github.com:opendatalab/MinerU

parents 9ec91339 2cb82b7f
...@@ -36,9 +36,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc ...@@ -36,9 +36,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
all_bboxes = fix_text_overlap_title_blocks(all_bboxes) all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
'''任何框体与舍弃框重叠,优先信任舍弃框''' '''任何框体与舍弃框重叠,优先信任舍弃框'''
all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks) all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
# @todo interline_equation 与title或text框冲突的情况,分两种情况处理
# interline_equation 与title或text框冲突的情况,分两种情况处理
'''interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框''' '''interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框'''
all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
'''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框''' '''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框'''
# 通过后续大框套小框逻辑删除
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)''' '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
for discarded in discarded_blocks: for discarded in discarded_blocks:
...@@ -57,6 +60,34 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc ...@@ -57,6 +60,34 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
return all_bboxes, all_discarded_blocks, drop_reasons return all_bboxes, all_discarded_blocks, drop_reasons
def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
    """Drop text blocks that almost coincide with an interline-equation block.

    Each entry of ``all_bboxes`` is indexed like a sequence: the first four
    items are passed to ``calculate_iou`` as the bounding box and item 7 is
    compared against ``BlockType``. Whenever the IoU between an interline
    equation box and a text box is greater than 0.8, the equation box is
    trusted and the text block is removed.

    The list is modified in place and the same list object is returned.
    """
    # Partition the blocks we care about by their type tag (index 7).
    texts = [blk for blk in all_bboxes if blk[7] == BlockType.Text]
    equations = [
        blk for blk in all_bboxes if blk[7] == BlockType.InterlineEquation
    ]

    # Collect the text blocks to discard first, so that all_bboxes is not
    # mutated while the candidate pairs are still being compared.
    doomed = []
    for equation in equations:
        equation_bbox = equation[:4]
        for text in texts:
            overlaps_heavily = calculate_iou(equation_bbox, text[:4]) > 0.8
            if overlaps_heavily and text not in doomed:
                doomed.append(text)

    # list.remove drops the first entry comparing equal to each doomed block.
    for text in doomed:
        all_bboxes.remove(text)
    return all_bboxes
def fix_text_overlap_title_blocks(all_bboxes): def fix_text_overlap_title_blocks(all_bboxes):
# 先提取所有text和title block # 先提取所有text和title block
text_blocks = [] text_blocks = []
...@@ -68,12 +99,19 @@ def fix_text_overlap_title_blocks(all_bboxes): ...@@ -68,12 +99,19 @@ def fix_text_overlap_title_blocks(all_bboxes):
if block[7] == BlockType.Title: if block[7] == BlockType.Title:
title_blocks.append(block) title_blocks.append(block)
need_remove = []
for text_block in text_blocks: for text_block in text_blocks:
for title_block in title_blocks: for title_block in title_blocks:
text_block_bbox = text_block[:4] text_block_bbox = text_block[:4]
title_block_bbox = title_block[:4] title_block_bbox = title_block[:4]
if calculate_iou(text_block_bbox, title_block_bbox) > 0.8: if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
all_bboxes.remove(title_block) if title_block not in need_remove:
need_remove.append(title_block)
if len(need_remove) > 0:
for block in need_remove:
all_bboxes.remove(block)
return all_bboxes return all_bboxes
......
...@@ -5,19 +5,24 @@ from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, g ...@@ -5,19 +5,24 @@ from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, g
from magic_pdf.libs.drop_tag import DropTag from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import ContentType, BlockType from magic_pdf.libs.ocr_content_type import ContentType, BlockType
def remove_overlaps_low_confidence_spans(spans): def remove_overlaps_low_confidence_spans(spans):
dropped_spans = [] dropped_spans = []
# 删除重叠spans中置信度低的的那些 # 删除重叠spans中置信度低的的那些
for span1 in spans: for span1 in spans:
for span2 in spans: for span2 in spans:
if span1 != span2: if span1 != span2:
if calculate_iou(span1['bbox'], span2['bbox']) > 0.9: # span1 或 span2 任何一个都不应该在 dropped_spans 中
if span1['score'] < span2['score']: if span1 in dropped_spans or span2 in dropped_spans:
span_need_remove = span1 continue
else: else:
span_need_remove = span2 if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
if span_need_remove is not None and span_need_remove not in dropped_spans: if span1['score'] < span2['score']:
dropped_spans.append(span_need_remove) span_need_remove = span1
else:
span_need_remove = span2
if span_need_remove is not None and span_need_remove not in dropped_spans:
dropped_spans.append(span_need_remove)
if len(dropped_spans) > 0: if len(dropped_spans) > 0:
for span_need_remove in dropped_spans: for span_need_remove in dropped_spans:
......
AUG: AUG:
DETR: true DETR: true
CACHE_DIR: /mnt/localdata/users/yupanhuang/cache/huggingface CACHE_DIR: ~/cache/huggingface
CUDNN_BENCHMARK: false CUDNN_BENCHMARK: false
DATALOADER: DATALOADER:
ASPECT_RATIO_GROUPING: true ASPECT_RATIO_GROUPING: true
...@@ -294,7 +294,7 @@ MODEL: ...@@ -294,7 +294,7 @@ MODEL:
POS_TYPE: abs POS_TYPE: abs
WEIGHTS: WEIGHTS:
OUTPUT_DIR: OUTPUT_DIR:
SCIHUB_DATA_DIR_TRAIN: /mnt/petrelfs/share_data/zhaozhiyuan/publaynet/layout_scihub/train SCIHUB_DATA_DIR_TRAIN: ~/publaynet/layout_scihub/train
SEED: 42 SEED: 42
SOLVER: SOLVER:
AMP: AMP:
......
...@@ -7,5 +7,5 @@ numpy>=1.21.6 ...@@ -7,5 +7,5 @@ numpy>=1.21.6
fast-langdetect>=0.2.1 fast-langdetect>=0.2.1
wordninja>=2.0.0 wordninja>=2.0.0
scikit-learn>=1.0.2 scikit-learn>=1.0.2
pdfminer.six>=20231228 pdfminer.six==20231228
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator. # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
...@@ -32,9 +32,8 @@ if __name__ == '__main__': ...@@ -32,9 +32,8 @@ if __name__ == '__main__':
}, },
install_requires=parse_requirements('requirements.txt'), # 项目依赖的第三方库 install_requires=parse_requirements('requirements.txt'), # 项目依赖的第三方库
extras_require={ extras_require={
"gpu": ["paddleocr==2.7.3", "paddlepaddle-gpu"], "lite": ["paddleocr==2.7.3", "paddlepaddle", "paddlepaddle-gpu"],
"cpu": ["paddleocr==2.7.3", "paddlepaddle"], "full": ["unimernet", "matplotlib", "ultralytics", "paddleocr==2.7.3", "paddlepaddle", "paddlepaddle-gpu"],
"full-cpu": ["unimernet", "matplotlib", "ultralytics", "paddleocr==2.7.3", "paddlepaddle"],
}, },
description="A practical tool for converting PDF to Markdown", # 简短描述 description="A practical tool for converting PDF to Markdown", # 简短描述
long_description=long_description, # 详细描述 long_description=long_description, # 详细描述
......
{
"signedContributors": [
]
}
\ No newline at end of file
...@@ -6,7 +6,7 @@ import json ...@@ -6,7 +6,7 @@ import json
from magic_pdf.pipe.UNIPipe import UNIPipe from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from lib import calculate_score from lib import calculate_score
import shutil
pdf_res_path = conf.conf["pdf_res_path"] pdf_res_path = conf.conf["pdf_res_path"]
code_path = conf.conf["code_path"] code_path = conf.conf["code_path"]
pdf_dev_path = conf.conf["pdf_dev_path"] pdf_dev_path = conf.conf["pdf_dev_path"]
...@@ -58,8 +58,8 @@ def pdf_to_markdown(): ...@@ -58,8 +58,8 @@ def pdf_to_markdown():
if not os.path.exists(dir_path): if not os.path.exists(dir_path):
os.makedirs(dir_path, exist_ok=True) os.makedirs(dir_path, exist_ok=True)
res_path = os.path.join(dir_path, f"{demo_name}.md") res_path = os.path.join(dir_path, f"{demo_name}.md")
#src_path = os.path.join(pdf_res_path, "pdf", f"{demo_name}.pdf") src_path = os.path.join(pdf_res_path, demo_name, "auto", f"{demo_name}.md")
#shutil.copy(src_path, res_path) shutil.copy(src_path, res_path)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment