Unverified commit 158e556b authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #1063 from opendatalab/release-0.10.0

Release 0.10.0
parents 038f48d3 30be5017
......@@ -3,13 +3,6 @@
name: mineru
on:
push:
branches:
- "master"
- "dev"
paths-ignore:
- "cmds/**"
- "**.md"
pull_request:
branches:
- "master"
......
......@@ -20,6 +20,7 @@ jobs:
source activate mineru
conda env list
pip show coverage
git checkout "dev"
# cd $GITHUB_WORKSPACE && sh tests/retry_env.sh
cd $GITHUB_WORKSPACE && python tests/clean_coverage.py
cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing
......
......@@ -10,7 +10,6 @@ on:
paths-ignore:
- "cmds/**"
- "**.md"
workflow_dispatch:
jobs:
cli-test:
if: github.repository == 'opendatalab/MinerU'
......
......@@ -42,6 +42,9 @@
</div>
# Changelog
- 2024/11/22 0.10.0 released. Introduced hybrid OCR text extraction capabilities:
- Significantly improved parsing in complex text-distribution scenarios such as dense formulas, irregular span regions, and text rendered as images.
- Combines the accuracy and speed of content extraction in text mode with the more precise span/line region recognition of OCR mode.
- 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
- 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
- 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
......
......@@ -42,6 +42,9 @@
</div>
# Changelog
- 2024/11/22 0.10.0 released. Introduced hybrid OCR text extraction capabilities:
- Significantly improved parsing in complex text-distribution scenarios such as dense formulas, irregular span regions, and text rendered as images.
- Combines the accuracy and speed of content extraction in text mode with the more precise span/line region recognition of OCR mode.
- 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
- 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition.
- 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
......
"""
span维度自定义字段
"""
"""span维度自定义字段."""
# span是否是跨页合并的
CROSS_PAGE = "cross_page"
CROSS_PAGE = 'cross_page'
"""
block维度自定义字段
"""
# block中lines是否被删除
LINES_DELETED = "lines_deleted"
LINES_DELETED = 'lines_deleted'
# table recognition max time default value
TABLE_MAX_TIME_VALUE = 400
......@@ -17,39 +15,39 @@ TABLE_MAX_TIME_VALUE = 400
TABLE_MAX_LEN = 480
# table master structure dict
TABLE_MASTER_DICT = "table_master_structure_dict.txt"
TABLE_MASTER_DICT = 'table_master_structure_dict.txt'
# table master dir
TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
TABLE_MASTER_DIR = 'table_structure_tablemaster_infer/'
# pp detect model dir
DETECT_MODEL_DIR = "ch_PP-OCRv4_det_infer"
DETECT_MODEL_DIR = 'ch_PP-OCRv4_det_infer'
# pp rec model dir
REC_MODEL_DIR = "ch_PP-OCRv4_rec_infer"
REC_MODEL_DIR = 'ch_PP-OCRv4_rec_infer'
# pp rec char dict path
REC_CHAR_DICT = "ppocr_keys_v1.txt"
REC_CHAR_DICT = 'ppocr_keys_v1.txt'
# pp rec copy rec directory
PP_REC_DIRECTORY = ".paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer"
PP_REC_DIRECTORY = '.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer'
# pp rec copy det directory
PP_DET_DIRECTORY = ".paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer"
PP_DET_DIRECTORY = '.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer'
class MODEL_NAME:
# pp table structure algorithm
TABLE_MASTER = "tablemaster"
TABLE_MASTER = 'tablemaster'
# struct eqtable
STRUCT_EQTABLE = "struct_eqtable"
STRUCT_EQTABLE = 'struct_eqtable'
DocLayout_YOLO = "doclayout_yolo"
DocLayout_YOLO = 'doclayout_yolo'
LAYOUTLMv3 = "layoutlmv3"
LAYOUTLMv3 = 'layoutlmv3'
YOLO_V8_MFD = "yolo_v8_mfd"
YOLO_V8_MFD = 'yolo_v8_mfd'
UniMerNet_v2_Small = "unimernet_small"
UniMerNet_v2_Small = 'unimernet_small'
RAPID_TABLE = "rapid_table"
\ No newline at end of file
RAPID_TABLE = 'rapid_table'
class DropReason:
TEXT_BLCOK_HOR_OVERLAP = 'text_block_horizontal_overlap' # 文字块有水平互相覆盖,导致无法准确定位文字顺序
USEFUL_BLOCK_HOR_OVERLAP = (
'useful_block_horizontal_overlap' # 需保留的block水平覆盖
)
COMPLICATED_LAYOUT = 'complicated_layout' # 复杂的布局,暂时不支持
TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns' # 目前不支持分栏超过2列的
COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box' # 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。
HIGH_COMPUTATIONAL_lOAD_BY_IMGS = (
'high_computational_load_by_imgs' # 含特殊图片,计算量太大,从而丢弃
)
HIGH_COMPUTATIONAL_lOAD_BY_SVGS = (
'high_computational_load_by_svgs' # 特殊的SVG图,计算量太大,从而丢弃
)
HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = 'high_computational_load_by_total_pages' # 计算量超过负荷,当前方法下计算量消耗过大
MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result' # 版面分析失败
Exception = '_exception' # 解析中发生异常
ENCRYPTED = 'encrypted' # PDF是加密的
EMPTY_PDF = 'total_page=0' # PDF页面总数为0
NOT_IS_TEXT_PDF = 'not_is_text_pdf' # 不是文字版PDF,无法直接解析
DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block' # 无法清晰的分段
TITLE_DETECTION_FAILED = 'title_detection_failed' # 探测标题失败
TITLE_LEVEL_FAILED = (
'title_level_failed' # 分析标题级别失败(例如一级、二级、三级标题)
)
PARA_SPLIT_FAILED = 'para_split_failed' # 识别段落失败
PARA_MERGE_FAILED = 'para_merge_failed' # 段落合并失败
NOT_ALLOW_LANGUAGE = 'not_allow_language' # 不支持的语种
SPECIAL_PDF = 'special_pdf'
PSEUDO_SINGLE_COLUMN = 'pseudo_single_column' # 无法精确判断文字分栏
CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout' # 无法分析页面的版面
NEGATIVE_BBOX_AREA = 'negative_bbox_area' # 缩放导致 bbox 面积为负
OVERLAP_BLOCKS_CAN_NOT_SEPARATION = (
'overlap_blocks_can_t_separation' # 无法分离重叠的block
)
COLOR_BG_HEADER_TXT_BLOCK = 'color_background_header_txt_block'
PAGE_NO = 'page-no' # 页码
CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area' # 页眉页脚内的文本
VERTICAL_TEXT = 'vertical-text' # 垂直文本
ROTATE_TEXT = 'rotate-text' # 旋转文本
EMPTY_SIDE_BLOCK = 'empty-side-block' # 边缘上的空白没有任何内容的block
ON_IMAGE_TEXT = 'on-image-text' # 文本在图片上
ON_TABLE_TEXT = 'on-table-text' # 文本在表格上
class DropTag:
PAGE_NUMBER = 'page_no'
HEADER = 'header'
FOOTER = 'footer'
FOOTNOTE = 'footnote'
NOT_IN_LAYOUT = 'not_in_layout'
SPAN_OVERLAP = 'span_overlap'
BLOCK_OVERLAP = 'block_overlap'
class MakeMode:
MM_MD = 'mm_markdown'
NLP_MD = 'nlp_markdown'
STANDARD_FORMAT = 'standard_format'
class DropMode:
WHOLE_PDF = 'whole_pdf'
SINGLE_PAGE = 'single_page'
NONE = 'none'
NONE_WITH_REASON = 'none_with_reason'
from enum import Enum
class ModelBlockTypeEnum(Enum):
TITLE = 0
PLAIN_TEXT = 1
ABANDON = 2
ISOLATE_FORMULA = 8
EMBEDDING = 13
ISOLATED = 14
\ No newline at end of file
ISOLATED = 14
......@@ -35,7 +35,7 @@ def read_jsonl(
jsonl_d = [
json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
]
for d in jsonl_d[:5]:
for d in jsonl_d:
pdf_path = d.get('file_location', '') or d.get('path', '')
if len(pdf_path) == 0:
raise EmptyData('pdf file location is empty')
......
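For reference, a minimal self-contained sketch of the manifest format consumed above (field names are taken from the snippet; the S3 paths are invented): each JSONL line is a JSON object whose `file_location` or `path` field points at a PDF.

```python
import json

# Two fake manifest lines; real manifests carry one JSON object per line.
jsonl_bits = b'{"file_location": "s3://bucket/a.pdf"}\n{"path": "s3://bucket/b.pdf"}\n'

jsonl_d = [
    json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
]
for d in jsonl_d:
    pdf_path = d.get('file_location', '') or d.get('path', '')
    if len(pdf_path) == 0:
        raise ValueError('pdf file location is empty')  # EmptyData in the real code
    print(pdf_path)
```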
import math
from loguru import logger
from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
from magic_pdf.config.ocr_content_type import ContentType
from magic_pdf.libs.boxbase import (find_bottom_nearest_text_bbox,
find_top_nearest_text_bbox)
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
TYPE_INLINE_EQUATION = ContentType.InlineEquation
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
......@@ -12,33 +14,30 @@ UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
@DeprecationWarning
def mk_nlp_markdown_1(para_dict: dict):
"""
对排序后的bboxes拼接内容
"""
"""对排序后的bboxes拼接内容."""
content_lst = []
for _, page_info in para_dict.items():
para_blocks = page_info.get("para_blocks")
para_blocks = page_info.get('para_blocks')
if not para_blocks:
continue
for block in para_blocks:
item = block["paras"]
item = block['paras']
for _, p in item.items():
para_text = p["para_text"]
is_title = p["is_para_title"]
para_text = p['para_text']
is_title = p['is_para_title']
title_level = p['para_title_level']
md_title_prefix = "#"*title_level
md_title_prefix = '#' * title_level
if is_title:
content_lst.append(f"{md_title_prefix} {para_text}")
content_lst.append(f'{md_title_prefix} {para_text}')
else:
content_lst.append(para_text)
content_text = "\n\n".join(content_lst)
content_text = '\n\n'.join(content_lst)
return content_text
# 找到目标字符串在段落中的索引
def __find_index(paragraph, target):
index = paragraph.find(target)
......@@ -48,69 +47,76 @@ def __find_index(paragraph, target):
return None
def __insert_string(paragraph, target, postion):
new_paragraph = paragraph[:postion] + target + paragraph[postion:]
def __insert_string(paragraph, target, position):
new_paragraph = paragraph[:position] + target + paragraph[position:]
return new_paragraph
def __insert_after(content, image_content, target):
"""
在content中找到target,将image_content插入到target后面
"""
"""在content中找到target,将image_content插入到target后面."""
index = content.find(target)
if index != -1:
content = content[:index+len(target)] + "\n\n" + image_content + "\n\n" + content[index+len(target):]
content = (
content[: index + len(target)]
+ '\n\n'
+ image_content
+ '\n\n'
+ content[index + len(target) :]
)
else:
logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
logger.error(
f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
)
return content
def __insert_before(content, image_content, target):
"""
在content中找到target,将image_content插入到target前面
"""
"""在content中找到target,将image_content插入到target前面."""
index = content.find(target)
if index != -1:
content = content[:index] + "\n\n" + image_content + "\n\n" + content[index:]
content = content[:index] + '\n\n' + image_content + '\n\n' + content[index:]
else:
logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
logger.error(
f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
)
return content
@DeprecationWarning
def mk_mm_markdown_1(para_dict: dict):
"""拼装多模态markdown"""
"""拼装多模态markdown."""
content_lst = []
for _, page_info in para_dict.items():
page_lst = [] # 一个page内的段落列表
para_blocks = page_info.get("para_blocks")
pymu_raw_blocks = page_info.get("preproc_blocks")
page_lst = [] # 一个page内的段落列表
para_blocks = page_info.get('para_blocks')
pymu_raw_blocks = page_info.get('preproc_blocks')
all_page_images = []
all_page_images.extend(page_info.get("images",[]))
all_page_images.extend(page_info.get("image_backup", []) )
all_page_images.extend(page_info.get("tables",[]))
all_page_images.extend(page_info.get("table_backup",[]) )
if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
all_page_images.extend(page_info.get('images', []))
all_page_images.extend(page_info.get('image_backup', []))
all_page_images.extend(page_info.get('tables', []))
all_page_images.extend(page_info.get('table_backup', []))
if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
for img in all_page_images:
page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序
page_md = "\n\n".join(page_lst)
page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序
page_md = '\n\n'.join(page_lst)
else:
for block in para_blocks:
item = block["paras"]
item = block['paras']
for _, p in item.items():
para_text = p["para_text"]
is_title = p["is_para_title"]
para_text = p['para_text']
is_title = p['is_para_title']
title_level = p['para_title_level']
md_title_prefix = "#"*title_level
md_title_prefix = '#' * title_level
if is_title:
page_lst.append(f"{md_title_prefix} {para_text}")
page_lst.append(f'{md_title_prefix} {para_text}')
else:
page_lst.append(para_text)
"""拼装成一个页面的文本"""
page_md = "\n\n".join(page_lst)
page_md = '\n\n'.join(page_lst)
"""插入图片"""
for img in all_page_images:
imgbox = img['bbox']
......@@ -118,192 +124,215 @@ def mk_mm_markdown_1(para_dict: dict):
# 先看在哪个block内
for block in pymu_raw_blocks:
bbox = block['bbox']
if bbox[0]-1 <= imgbox[0] < bbox[2]+1 and bbox[1]-1 <= imgbox[1] < bbox[3]+1:# 确定在block内
for l in block['lines']:
if (
bbox[0] - 1 <= imgbox[0] < bbox[2] + 1
and bbox[1] - 1 <= imgbox[1] < bbox[3] + 1
): # 确定在block内
for l in block['lines']: # noqa: E741
line_box = l['bbox']
if line_box[0]-1 <= imgbox[0] < line_box[2]+1 and line_box[1]-1 <= imgbox[1] < line_box[3]+1: # 在line内的,插入line前面
line_txt = "".join([s['text'] for s in l['spans']])
page_md = __insert_before(page_md, img_content, line_txt)
if (
line_box[0] - 1 <= imgbox[0] < line_box[2] + 1
and line_box[1] - 1 <= imgbox[1] < line_box[3] + 1
): # 在line内的,插入line前面
line_txt = ''.join([s['text'] for s in l['spans']])
page_md = __insert_before(
page_md, img_content, line_txt
)
break
break
else:# 在行与行之间
else: # 在行与行之间
# 找到图片x0,y0与line的x0,y0最近的line
min_distance = 100000
min_line = None
for l in block['lines']:
for l in block['lines']: # noqa: E741
line_box = l['bbox']
distance = math.sqrt((line_box[0] - imgbox[0])**2 + (line_box[1] - imgbox[1])**2)
distance = math.sqrt(
(line_box[0] - imgbox[0]) ** 2
+ (line_box[1] - imgbox[1]) ** 2
)
if distance < min_distance:
min_distance = distance
min_line = l
if min_line:
line_txt = "".join([s['text'] for s in min_line['spans']])
line_txt = ''.join(
[s['text'] for s in min_line['spans']]
)
img_h = imgbox[3] - imgbox[1]
if min_distance<img_h: # 文字在图片前面
page_md = __insert_after(page_md, img_content, line_txt)
if min_distance < img_h: # 文字在图片前面
page_md = __insert_after(
page_md, img_content, line_txt
)
else:
page_md = __insert_before(page_md, img_content, line_txt)
page_md = __insert_before(
page_md, img_content, line_txt
)
else:
logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #1")
else:# 应当在两个block之间
logger.error(
f"Can't find the location of image {img['image_path']} in the markdown file #1"
)
else: # 应当在两个block之间
# 找到上方最近的block,如果上方没有就找大下方最近的block
top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
if top_txt_block:
line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
line_txt = ''.join(
[s['text'] for s in top_txt_block['lines'][-1]['spans']]
)
page_md = __insert_after(page_md, img_content, line_txt)
else:
bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, imgbox)
bottom_txt_block = find_bottom_nearest_text_bbox(
pymu_raw_blocks, imgbox
)
if bottom_txt_block:
line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
line_txt = ''.join(
[
s['text']
for s in bottom_txt_block['lines'][0]['spans']
]
)
page_md = __insert_before(page_md, img_content, line_txt)
else:
logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #2")
logger.error(
f"Can't find the location of image {img['image_path']} in the markdown file #2"
)
content_lst.append(page_md)
"""拼装成全部页面的文本"""
content_text = "\n\n".join(content_lst)
content_text = '\n\n'.join(content_lst)
return content_text
def __insert_after_para(text, type, element, content_list):
"""
在content_list中找到text,将image_path作为一个新的node插入到text后面
"""
"""在content_list中找到text,将image_path作为一个新的node插入到text后面."""
for i, c in enumerate(content_list):
content_type = c.get("type")
if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
if type == "image":
content_type = c.get('type')
if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
if type == 'image':
content_node = {
"type": "image",
"img_path": element.get("image_path"),
"img_alt": "",
"img_title": "",
"img_caption": "",
'type': 'image',
'img_path': element.get('image_path'),
'img_alt': '',
'img_title': '',
'img_caption': '',
}
elif type == "table":
elif type == 'table':
content_node = {
"type": "table",
"img_path": element.get("image_path"),
"table_latex": element.get("text"),
"table_title": "",
"table_caption": "",
"table_quality": element.get("quality"),
'type': 'table',
'img_path': element.get('image_path'),
'table_latex': element.get('text'),
'table_title': '',
'table_caption': '',
'table_quality': element.get('quality'),
}
content_list.insert(i+1, content_node)
content_list.insert(i + 1, content_node)
break
else:
logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
logger.error(
f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
)
def __insert_before_para(text, type, element, content_list):
"""
在content_list中找到text,将image_path作为一个新的node插入到text前面
"""
"""在content_list中找到text,将image_path作为一个新的node插入到text前面."""
for i, c in enumerate(content_list):
content_type = c.get("type")
if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
if type == "image":
content_type = c.get('type')
if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
if type == 'image':
content_node = {
"type": "image",
"img_path": element.get("image_path"),
"img_alt": "",
"img_title": "",
"img_caption": "",
'type': 'image',
'img_path': element.get('image_path'),
'img_alt': '',
'img_title': '',
'img_caption': '',
}
elif type == "table":
elif type == 'table':
content_node = {
"type": "table",
"img_path": element.get("image_path"),
"table_latex": element.get("text"),
"table_title": "",
"table_caption": "",
"table_quality": element.get("quality"),
'type': 'table',
'img_path': element.get('image_path'),
'table_latex': element.get('text'),
'table_title': '',
'table_caption': '',
'table_quality': element.get('quality'),
}
content_list.insert(i, content_node)
break
else:
logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
logger.error(
f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
)
def mk_universal_format(pdf_info_list: list, img_buket_path):
"""
构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
"""
"""构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY."""
content_lst = []
for page_info in pdf_info_list:
page_lst = [] # 一个page内的段落列表
para_blocks = page_info.get("para_blocks")
pymu_raw_blocks = page_info.get("preproc_blocks")
page_lst = [] # 一个page内的段落列表
para_blocks = page_info.get('para_blocks')
pymu_raw_blocks = page_info.get('preproc_blocks')
all_page_images = []
all_page_images.extend(page_info.get("images",[]))
all_page_images.extend(page_info.get("image_backup", []) )
all_page_images.extend(page_info.get('images', []))
all_page_images.extend(page_info.get('image_backup', []))
# all_page_images.extend(page_info.get("tables",[]))
# all_page_images.extend(page_info.get("table_backup",[]) )
all_page_tables = []
all_page_tables.extend(page_info.get("tables", []))
all_page_tables.extend(page_info.get('tables', []))
if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
for img in all_page_images:
content_node = {
"type": "image",
"img_path": join_path(img_buket_path, img['image_path']),
"img_alt":"",
"img_title":"",
"img_caption":""
'type': 'image',
'img_path': join_path(img_buket_path, img['image_path']),
'img_alt': '',
'img_title': '',
'img_caption': '',
}
page_lst.append(content_node) # TODO 图片顺序
page_lst.append(content_node) # TODO 图片顺序
for table in all_page_tables:
content_node = {
"type": "table",
"img_path": join_path(img_buket_path, table['image_path']),
"table_latex": table.get("text"),
"table_title": "",
"table_caption": "",
"table_quality": table.get("quality"),
'type': 'table',
'img_path': join_path(img_buket_path, table['image_path']),
'table_latex': table.get('text'),
'table_title': '',
'table_caption': '',
'table_quality': table.get('quality'),
}
page_lst.append(content_node) # TODO 图片顺序
page_lst.append(content_node) # TODO 图片顺序
else:
for block in para_blocks:
item = block["paras"]
item = block['paras']
for _, p in item.items():
font_type = p['para_font_type']# 对于文本来说,要么是普通文本,要么是个行间公式
font_type = p[
'para_font_type'
] # 对于文本来说,要么是普通文本,要么是个行间公式
if font_type == TYPE_INTERLINE_EQUATION:
content_node = {
"type": "equation",
"latex": p["para_text"]
}
content_node = {'type': 'equation', 'latex': p['para_text']}
page_lst.append(content_node)
else:
para_text = p["para_text"]
is_title = p["is_para_title"]
para_text = p['para_text']
is_title = p['is_para_title']
title_level = p['para_title_level']
if is_title:
content_node = {
"type": f"h{title_level}",
"text": para_text
'type': f'h{title_level}',
'text': para_text,
}
page_lst.append(content_node)
else:
content_node = {
"type": "text",
"text": para_text
}
content_node = {'type': 'text', 'text': para_text}
page_lst.append(content_node)
content_lst.extend(page_lst)
"""插入图片"""
for img in all_page_images:
insert_img_or_table("image", img, pymu_raw_blocks, content_lst)
insert_img_or_table('image', img, pymu_raw_blocks, content_lst)
"""插入表格"""
for table in all_page_tables:
insert_img_or_table("table", table, pymu_raw_blocks, content_lst)
insert_img_or_table('table', table, pymu_raw_blocks, content_lst)
# end for
return content_lst
......@@ -313,13 +342,17 @@ def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
# 先看在哪个block内
for block in pymu_raw_blocks:
bbox = block['bbox']
if bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1 and bbox[1] - 1 <= element_bbox[1] < bbox[
3] + 1: # 确定在这个大的block内,然后进入逐行比较距离
for l in block['lines']:
if (
bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1
and bbox[1] - 1 <= element_bbox[1] < bbox[3] + 1
): # 确定在这个大的block内,然后进入逐行比较距离
for l in block['lines']: # noqa: E741
line_box = l['bbox']
if line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1 and line_box[1] - 1 <= element_bbox[1] < line_box[
3] + 1: # 在line内的,插入line前面
line_txt = "".join([s['text'] for s in l['spans']])
if (
line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1
and line_box[1] - 1 <= element_bbox[1] < line_box[3] + 1
): # 在line内的,插入line前面
line_txt = ''.join([s['text'] for s in l['spans']])
__insert_before_para(line_txt, type, element, content_lst)
break
break
......@@ -327,14 +360,17 @@ def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
# 找到图片x0,y0与line的x0,y0最近的line
min_distance = 100000
min_line = None
for l in block['lines']:
for l in block['lines']: # noqa: E741
line_box = l['bbox']
distance = math.sqrt((line_box[0] - element_bbox[0]) ** 2 + (line_box[1] - element_bbox[1]) ** 2)
distance = math.sqrt(
(line_box[0] - element_bbox[0]) ** 2
+ (line_box[1] - element_bbox[1]) ** 2
)
if distance < min_distance:
min_distance = distance
min_line = l
if min_line:
line_txt = "".join([s['text'] for s in min_line['spans']])
line_txt = ''.join([s['text'] for s in min_line['spans']])
img_h = element_bbox[3] - element_bbox[1]
if min_distance < img_h: # 文字在图片前面
__insert_after_para(line_txt, type, element, content_lst)
......@@ -342,56 +378,61 @@ def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
__insert_before_para(line_txt, type, element, content_lst)
break
else:
logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #1")
logger.error(
f"Can't find the location of image {element.get('image_path')} in the markdown file #1"
)
else: # 应当在两个block之间
# 找到上方最近的block,如果上方没有就找大下方最近的block
top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
if top_txt_block:
line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
line_txt = ''.join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
__insert_after_para(line_txt, type, element, content_lst)
else:
bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, element_bbox)
bottom_txt_block = find_bottom_nearest_text_bbox(
pymu_raw_blocks, element_bbox
)
if bottom_txt_block:
line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
line_txt = ''.join(
[s['text'] for s in bottom_txt_block['lines'][0]['spans']]
)
__insert_before_para(line_txt, type, element, content_lst)
else: # TODO ,图片可能独占一列,这种情况上下是没有图片的
logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #2")
logger.error(
f"Can't find the location of image {element.get('image_path')} in the markdown file #2"
)
def mk_mm_markdown(content_list):
"""
基于同一格式的内容列表,构造markdown,含图片
"""
"""基于同一格式的内容列表,构造markdown,含图片."""
content_md = []
for c in content_list:
content_type = c.get("type")
if content_type == "text":
content_md.append(c.get("text"))
elif content_type == "equation":
content = c.get("latex")
if content.startswith("$$") and content.endswith("$$"):
content_type = c.get('type')
if content_type == 'text':
content_md.append(c.get('text'))
elif content_type == 'equation':
content = c.get('latex')
if content.startswith('$$') and content.endswith('$$'):
content_md.append(content)
else:
content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
elif content_type in UNI_FORMAT_TEXT_TYPE:
content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
elif content_type == "image":
elif content_type == 'image':
content_md.append(f"![]({c.get('img_path')})")
return "\n\n".join(content_md)
return '\n\n'.join(content_md)
def mk_nlp_markdown(content_list):
"""
基于同一格式的内容列表,构造markdown,不含图片
"""
"""基于同一格式的内容列表,构造markdown,不含图片."""
content_md = []
for c in content_list:
content_type = c.get("type")
if content_type == "text":
content_md.append(c.get("text"))
elif content_type == "equation":
content_type = c.get('type')
if content_type == 'text':
content_md.append(c.get('text'))
elif content_type == 'equation':
content_md.append(f"$$\n{c.get('latex')}\n$$")
elif content_type == "table":
elif content_type == 'table':
content_md.append(f"$$$\n{c.get('table_latex')}\n$$$")
elif content_type in UNI_FORMAT_TEXT_TYPE:
content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
return "\n\n".join(content_md)
\ No newline at end of file
return '\n\n'.join(content_md)
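To make the unified format above concrete, here is an illustrative, hand-written `content_list` together with the markdown `mk_mm_markdown` is expected to emit for it, based on the branches shown in the code (the node shapes and key names are taken from that code; the values are made up).

```python
# h1..h6/text nodes carry "text", equation nodes carry "latex", image nodes carry "img_path".
content_list = [
    {'type': 'h1', 'text': 'Introduction'},
    {'type': 'text', 'text': 'MinerU extracts structured content from PDFs.'},
    {'type': 'equation', 'latex': 'E = mc^2'},
    {'type': 'image', 'img_path': 'images/fig1.png'},
]

# Expected markdown, roughly (blocks joined by blank lines; bare LaTeX gets wrapped in $$,
# which also adds a couple of extra blank lines around the equation):
#
# # Introduction
#
# MinerU extracts structured content from PDFs.
#
# $$
# E = mc^2
# $$
#
# ![](images/fig1.png)
```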
......@@ -2,21 +2,20 @@ import re
from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
from magic_pdf.para.para_split_v3 import ListLineTag
def __is_hyphen_at_line_end(line):
"""
Check if a line ends with one or more letters followed by a hyphen.
"""Check if a line ends with one or more letters followed by a hyphen.
Args:
line (str): The line of text to check.
Returns:
bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
"""
......@@ -142,9 +141,10 @@ def merge_para_with_text(para_block):
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()
if line_text != '':
line_lang = detect_lang(line_text)
for span in line['spans']:
for j, span in enumerate(line['spans']):
span_type = span['type']
content = ''
......@@ -162,16 +162,16 @@ def merge_para_with_text(para_block):
if span_type in [ContentType.Text, ContentType.InterlineEquation]:
para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
elif span_type == ContentType.InlineEquation:
para_text += f" {content} "
para_text += f' {content} '
else:
if span_type in [ContentType.Text, ContentType.InlineEquation]:
# 如果是前一行带有-连字符,那么末尾不应该加空格
if __is_hyphen_at_line_end(content):
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
para_text += content[:-1]
elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit():
para_text += content
else: # 西方文本语境下 content间需要空格分隔
para_text += f"{content} "
para_text += f'{content} '
elif span_type == ContentType.InterlineEquation:
para_text += content
else:
......
"""
输入: s3路径,每行一个
输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置
"""
"""输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置."""
import sys
import click
from collections import Counter
from magic_pdf.libs.commons import read_file, mymax, get_top_percent_list
from magic_pdf.libs.commons import fitz
import click
from loguru import logger
from collections import Counter
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.libs.commons import fitz, get_top_percent_list, mymax, read_file
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.pdf_check import detect_invalid_chars
......@@ -19,8 +16,10 @@ junk_limit_min = 10
def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
result]
max_image_area_per_page = [
mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz])
for page_img_sz in result
]
page_area = int(page_width_pts) * int(page_height_pts)
max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
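A self-contained toy illustration of this check (all numbers invented): per page, take the largest image bbox, normalise its area by the page area, and keep only pages where a single image covers more than 60% of the page.

```python
# Each inner list is one page's [x0, y0, x1, y1, img_bojid] image records.
result = [
    [[0, 0, 600, 780, 7]],     # page 1: one near-full-page image
    [[0, 0, 600, 780, 8]],     # page 2: likewise
    [[10, 10, 80, 60, 9]],     # page 3: only a small logo
]
page_area = 612 * 792          # page size in points (US Letter here)

ratios = [
    max((x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page) / page_area
    for page in result
]
print([round(r, 2) for r in ratios if r > 0.6])  # [0.97, 0.97] -> scan-like pages
```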
......@@ -32,8 +31,10 @@ def process_image(page, junk_img_bojids=[]):
items = page.get_images()
dedup = set()
for img in items:
# 这里返回的是图片在page上的实际展示的大小。返回一个数组,每个元素第一部分是
img_bojid = img[0] # 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等
# 这里返回的是图片在page上的实际展示的大小。返回一个数组,每个元素第一部分是
img_bojid = img[
0
] # 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等
if img_bojid in junk_img_bojids: # 如果是垃圾图像,就跳过
continue
recs = page.get_image_rects(img, transform=True)
......@@ -42,9 +43,17 @@ def process_image(page, junk_img_bojids=[]):
x0, y0, x1, y1 = map(int, rec)
width = x1 - x0
height = y1 - y0
if (x0, y0, x1, y1, img_bojid) in dedup: # 这里面会出现一些重复的bbox,无需重复出现,需要去掉
if (
x0,
y0,
x1,
y1,
img_bojid,
) in dedup: # 这里面会出现一些重复的bbox,无需重复出现,需要去掉
continue
if not all([width, height]): # 长和宽任何一个都不能是0,否则这个图片不可见,没有实际意义
if not all(
[width, height]
): # 长和宽任何一个都不能是0,否则这个图片不可见,没有实际意义
continue
dedup.add((x0, y0, x1, y1, img_bojid))
page_result.append([x0, y0, x1, y1, img_bojid])
......@@ -52,29 +61,33 @@ def process_image(page, junk_img_bojids=[]):
def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
"""
返回每个页面里的图片的四元组,每个页面多个图片。
"""返回每个页面里的图片的四元组,每个页面多个图片。
:param doc:
:return:
"""
# use a Counter to count how many times each img_bojid appears
# use a Counter to count how many times each img_bojid appears
img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
# find the img_bojids that appear on more than half of the pages
# find the img_bojids that appear on more than half of the pages
junk_limit = max(len(doc) * 0.5, junk_limit_min) # exempt documents with very few pages
junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
# todo: add a check using only the first ten pages; junk images must both appear often enough and cover a large enough share of the page, and the images should all be roughly the same size
# there are two kinds of scanned PDFs and one kind of text PDF, so misclassification is possible here
# scanned type 1: every page stores all scanned-page images; images cover most of the page, one shown per page
# scanned type 2: the number of stored scanned-page images grows page by page; images cover most of the page, one shown per page; the junk list must be cleared and the first 50 pages of image info collected for classification
# text type 1: every page stores all images; images cover only a small share of the page, and a page may show zero or more of them; such PDFs need the first 10 pages sampled for image size and count, and the junk list cleared if they match
junk_img_bojids = [
img_bojid
for img_bojid, count in img_bojid_counter.items()
if count >= junk_limit
]
# todo: add a check using only the first ten pages; junk images must both appear often enough and cover a large enough share of the page, and the images should all be roughly the same size
# there are two kinds of scanned PDFs and one kind of text PDF, so misclassification is possible here
# scanned type 1: every page stores all scanned-page images; images cover most of the page, one shown per page
# scanned type 2: the number of stored scanned-page images grows page by page; images cover most of the page, one shown per page; the junk list must be cleared and the first 50 pages of image info collected for classification
# text type 1: every page stores all images; images cover only a small share of the page, and a page may show zero or more of them; such PDFs need the first 10 pages sampled for image size and count, and the junk list cleared if they match
imgs_len_list = [len(page.get_images()) for page in doc]
special_limit_pages = 10
# always base the decision on the first ten pages
# always base the decision on the first ten pages
result = []
break_loop = False
for i, page in enumerate(doc):
......@@ -82,12 +95,18 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
break
if i >= special_limit_pages:
break
page_result = process_image(page) # 这里不传junk_img_bojids,拿前十页所有图片信息用于后续分析
page_result = process_image(
page
) # 这里不传junk_img_bojids,拿前十页所有图片信息用于后续分析
result.append(page_result)
for item in result:
if not any(item): # 如果任何一页没有图片,说明是个文字版,需要判断是否为特殊文字版
if max(imgs_len_list) == min(imgs_len_list) and max(
imgs_len_list) >= junk_limit_min: # 如果是特殊文字版,就把junklist置空并break
if not any(
item
): # 如果任何一页没有图片,说明是个文字版,需要判断是否为特殊文字版
if (
max(imgs_len_list) == min(imgs_len_list)
and max(imgs_len_list) >= junk_limit_min
): # 如果是特殊文字版,就把junklist置空并break
junk_img_bojids = []
else: # 不是特殊文字版,是个普通文字版,但是存在垃圾图片,不置空junklist
pass
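Stripped of the scanned/text special cases, the junk-image heuristic above is an occurrence count over image xrefs; a minimal self-contained sketch with synthetic data (my own simplification of the logic shown):

```python
from collections import Counter

junk_limit_min = 10
# Fake per-page image xref lists: xref 42 (a watermark) appears on every page.
pages = [[42, 100 + i] for i in range(20)]

img_bojid_counter = Counter(xref for page in pages for xref in page)
junk_limit = max(len(pages) * 0.5, junk_limit_min)
junk_img_bojids = [x for x, c in img_bojid_counter.items() if c >= junk_limit]
print(junk_img_bojids)  # [42] -> treated as junk (watermark / header / footer)
```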
......@@ -98,20 +117,23 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
top_eighty_percent = get_top_percent_list(imgs_len_list, 0.8)
# 检查前80%的元素是否都相等
if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
# # 如果前10页跑完都有图,根据每页图片数量是否相等判断是否需要清除junklist
# if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
#前10页都有图,且每页数量一致,需要检测图片大小占页面的比例判断是否需要清除junklist
max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
if len(max_image_area_per_page) < 0.8 * special_limit_pages: # 前10页不全是大图,说明可能是个文字版pdf,把垃圾图片list置空
# 前10页都有图,且每页数量一致,需要检测图片大小占页面的比例判断是否需要清除junklist
max_image_area_per_page = calculate_max_image_area_per_page(
result, page_width_pts, page_height_pts
)
if (
len(max_image_area_per_page) < 0.8 * special_limit_pages
): # 前10页不全是大图,说明可能是个文字版pdf,把垃圾图片list置空
junk_img_bojids = []
else: # 前10页都有图,而且80%都是大图,且每页图片数量一致并都很多,说明是扫描版1,不需要清空junklist
pass
else: # 每页图片数量不一致,需要清掉junklist全量跑前50页图片
junk_img_bojids = []
#正式进入取前50页图片的信息流程
# 正式进入取前50页图片的信息流程
result = []
for i, page in enumerate(doc):
if i >= scan_max_page:
......@@ -126,7 +148,7 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
def get_pdf_page_size_pts(doc: fitz.Document):
page_cnt = len(doc)
l: int = min(page_cnt, 50)
#把所有宽度和高度塞到两个list 分别取中位数(中间遇到了个在纵页里塞横页的pdf,导致宽高互换了)
# 把所有宽度和高度塞到两个list 分别取中位数(中间遇到了个在纵页里塞横页的pdf,导致宽高互换了)
page_width_list = []
page_height_list = []
for i in range(l):
......@@ -152,8 +174,8 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
# 拿所有text的blocks
# text_block = page.get_text("words")
# text_block_len = sum([len(t[4]) for t in text_block])
#拿所有text的str
text_block = page.get_text("text")
# 拿所有text的str
text_block = page.get_text('text')
text_block_len = len(text_block)
# logger.info(f"page {page.number} text_block_len: {text_block_len}")
text_len_lst.append(text_block_len)
......@@ -162,15 +184,13 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
def get_pdf_text_layout_per_page(doc: fitz.Document):
"""
根据PDF文档的每一页文本布局,判断该页的文本布局是横向、纵向还是未知。
"""根据PDF文档的每一页文本布局,判断该页的文本布局是横向、纵向还是未知。
Args:
doc (fitz.Document): PDF文档对象。
Returns:
List[str]: 每一页的文本布局(横向、纵向、未知)。
"""
text_layout_list = []
......@@ -180,11 +200,11 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
# 创建每一页的纵向和横向的文本行数计数器
vertical_count = 0
horizontal_count = 0
text_dict = page.get_text("dict")
if "blocks" in text_dict:
for block in text_dict["blocks"]:
text_dict = page.get_text('dict')
if 'blocks' in text_dict:
for block in text_dict['blocks']:
if 'lines' in block:
for line in block["lines"]:
for line in block['lines']:
# 获取line的bbox顶点坐标
x0, y0, x1, y1 = line['bbox']
# 计算bbox的宽高
......@@ -199,8 +219,12 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
if len(font_sizes) > 0:
average_font_size = sum(font_sizes) / len(font_sizes)
else:
average_font_size = 10 # 有的line拿不到font_size,先定一个阈值100
if area <= average_font_size ** 2: # 判断bbox的面积是否小于平均字体大小的平方,单字无法计算是横向还是纵向
average_font_size = (
10 # 有的line拿不到font_size,先定一个阈值100
)
if (
area <= average_font_size**2
): # 判断bbox的面积是否小于平均字体大小的平方,单字无法计算是横向还是纵向
continue
else:
if 'wmode' in line: # 通过wmode判断文本方向
......@@ -228,22 +252,22 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
# print(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
# 判断每一页的文本布局
if vertical_count == 0 and horizontal_count == 0: # 该页没有文本,无法判断
text_layout_list.append("unknow")
text_layout_list.append('unknow')
continue
else:
if vertical_count > horizontal_count: # 该页的文本纵向行数大于横向的
text_layout_list.append("vertical")
text_layout_list.append('vertical')
else: # 该页的文本横向行数大于纵向的
text_layout_list.append("horizontal")
text_layout_list.append('horizontal')
# logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
return text_layout_list
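Boiled down, the per-page decision above is a majority vote over line orientations; a simplified self-contained sketch (it deliberately ignores the wmode and font-size refinements used in the real code):

```python
def classify_page_layout(line_bboxes):
    if not line_bboxes:
        return 'unknow'  # spelling kept to match the label used above
    # Wider-than-tall lines vote horizontal, taller-than-wide lines vote vertical.
    horizontal = sum(1 for x0, y0, x1, y1 in line_bboxes if (x1 - x0) > (y1 - y0))
    vertical = len(line_bboxes) - horizontal
    return 'vertical' if vertical > horizontal else 'horizontal'

print(classify_page_layout([(50, 100, 500, 112), (50, 130, 500, 142)]))  # horizontal
print(classify_page_layout([(50, 100, 62, 500)]))                        # vertical
```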
'''A custom exception raised for PDFs with too many SVGs on a single page'''
"""A custom exception raised for PDFs with too many SVGs on a single page"""
class PageSvgsTooManyError(Exception):
def __init__(self, message="Page SVGs are too many"):
def __init__(self, message='Page SVGs are too many'):
self.message = message
super().__init__(self.message)
......@@ -285,7 +309,7 @@ def get_language(doc: fitz.Document):
if page_id >= scan_max_page:
break
# 拿所有text的str
text_block = page.get_text("text")
text_block = page.get_text('text')
page_language = detect_lang(text_block)
language_lst.append(page_language)
......@@ -299,9 +323,7 @@ def get_language(doc: fitz.Document):
def check_invalid_chars(pdf_bytes):
"""
乱码检测
"""
"""乱码检测."""
return detect_invalid_chars(pdf_bytes)
......@@ -311,13 +333,13 @@ def pdf_meta_scan(pdf_bytes: bytes):
:param pdf_bytes: binary data of the pdf file
Evaluated along several dimensions: whether it is encrypted, whether it needs a password, page size, total page count, and whether text can be extracted
"""
doc = fitz.open("pdf", pdf_bytes)
doc = fitz.open('pdf', pdf_bytes)
is_needs_password = doc.needs_pass
is_encrypted = doc.is_encrypted
total_page = len(doc)
if total_page == 0:
logger.warning(f"drop this pdf, drop_reason: {DropReason.EMPTY_PDF}")
result = {"_need_drop": True, "_drop_reason": DropReason.EMPTY_PDF}
logger.warning(f'drop this pdf, drop_reason: {DropReason.EMPTY_PDF}')
result = {'_need_drop': True, '_drop_reason': DropReason.EMPTY_PDF}
return result
else:
page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
......@@ -328,7 +350,9 @@ def pdf_meta_scan(pdf_bytes: bytes):
imgs_per_page = get_imgs_per_page(doc)
# logger.info(f"imgs_per_page: {imgs_per_page}")
image_info_per_page, junk_img_bojids = get_image_info(doc, page_width_pts, page_height_pts)
image_info_per_page, junk_img_bojids = get_image_info(
doc, page_width_pts, page_height_pts
)
# logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
text_len_per_page = get_pdf_textlen_per_page(doc)
# logger.info(f"text_len_per_page: {text_len_per_page}")
......@@ -341,20 +365,20 @@ def pdf_meta_scan(pdf_bytes: bytes):
# 最后输出一条json
res = {
"is_needs_password": is_needs_password,
"is_encrypted": is_encrypted,
"total_page": total_page,
"page_width_pts": int(page_width_pts),
"page_height_pts": int(page_height_pts),
"image_info_per_page": image_info_per_page,
"text_len_per_page": text_len_per_page,
"text_layout_per_page": text_layout_per_page,
"text_language": text_language,
'is_needs_password': is_needs_password,
'is_encrypted': is_encrypted,
'total_page': total_page,
'page_width_pts': int(page_width_pts),
'page_height_pts': int(page_height_pts),
'image_info_per_page': image_info_per_page,
'text_len_per_page': text_len_per_page,
'text_layout_per_page': text_layout_per_page,
'text_language': text_language,
# "svgs_per_page": svgs_per_page,
"imgs_per_page": imgs_per_page, # 增加每页img数量list
"junk_img_bojids": junk_img_bojids, # 增加垃圾图片的bojid list
"invalid_chars": invalid_chars,
"metadata": doc.metadata
'imgs_per_page': imgs_per_page, # 增加每页img数量list
'junk_img_bojids': junk_img_bojids, # 增加垃圾图片的bojid list
'invalid_chars': invalid_chars,
'metadata': doc.metadata,
}
# logger.info(json.dumps(res, ensure_ascii=False))
return res
......@@ -364,14 +388,12 @@ def pdf_meta_scan(pdf_bytes: bytes):
@click.option('--s3-pdf-path', help='s3上pdf文件的路径')
@click.option('--s3-profile', help='s3上的profile')
def main(s3_pdf_path: str, s3_profile: str):
"""
"""
""""""
try:
file_content = read_file(s3_pdf_path, s3_profile)
pdf_meta_scan(file_content)
except Exception as e:
print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
print(f'ERROR: {s3_pdf_path}, {e}', file=sys.stderr)
logger.exception(e)
......@@ -381,7 +403,7 @@ if __name__ == '__main__':
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
# "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
# "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf"
# file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","")
# file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","") # noqa: E501
# file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
# doc = fitz.open("pdf", file_content)
# text_layout_lst = get_pdf_text_layout_per_page(doc)
......
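A hedged usage sketch of the scanner above (the module path is assumed from the repository layout, and `example.pdf` is any local file): it takes raw PDF bytes and returns either a drop marker or the metadata dict whose keys are listed in the hunk above.

```python
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan  # assumed module path

with open('example.pdf', 'rb') as f:          # any local PDF, for illustration
    meta = pdf_meta_scan(f.read())

if meta.get('_need_drop'):
    print('dropped:', meta['_drop_reason'])   # e.g. total_page=0
else:
    print(meta['total_page'], meta['text_language'])
    print(meta['text_layout_per_page'][:3])   # 'horizontal' / 'vertical' / 'unknow'
```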
......@@ -5,14 +5,13 @@ from pathlib import Path
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
from magic_pdf.integrations.rag.type import (CategoryType, ContentObject,
ElementRelation, ElementRelType,
LayoutElements,
LayoutElementsExtra, PageInfo)
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.tools.common import do_parse, prepare_env
......@@ -224,8 +223,8 @@ def inference(path, output_dir, method):
str(Path(path).stem), method)
def read_fn(path):
disk_rw = DiskReaderWriter(os.path.dirname(path))
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
disk_rw = FileBasedDataReader(os.path.dirname(path))
return disk_rw.read(os.path.basename(path))
def parse_doc(doc_path: str):
try:
......
class MakeMode:
MM_MD = "mm_markdown"
NLP_MD = "nlp_markdown"
STANDARD_FORMAT = "standard_format"
class DropMode:
WHOLE_PDF = "whole_pdf"
SINGLE_PAGE = "single_page"
NONE = "none"
NONE_WITH_REASON = "none_with_reason"
......@@ -5,7 +5,7 @@ import os
from loguru import logger
from magic_pdf.libs.Constants import MODEL_NAME
from magic_pdf.config.constants import MODEL_NAME
from magic_pdf.libs.commons import parse_bucket_key
# 定义配置文件名常量
......@@ -99,7 +99,7 @@ def get_table_recog_config():
def get_layout_config():
config = read_config()
layout_config = config.get("layout-config")
layout_config = config.get('layout-config')
if layout_config is None:
logger.warning(f"'layout-config' not found in {CONFIG_FILE_NAME}, use '{MODEL_NAME.LAYOUTLMv3}' as default")
return json.loads(f'{{"model": "{MODEL_NAME.LAYOUTLMv3}"}}')
......@@ -109,7 +109,7 @@ def get_layout_config():
def get_formula_config():
config = read_config()
formula_config = config.get("formula-config")
formula_config = config.get('formula-config')
if formula_config is None:
logger.warning(f"'formula-config' not found in {CONFIG_FILE_NAME}, use 'True' as default")
return json.loads(f'{{"mfd_model": "{MODEL_NAME.YOLO_V8_MFD}","mfr_model": "{MODEL_NAME.UniMerNet_v2_Small}","enable": true}}')
......@@ -117,5 +117,5 @@ def get_formula_config():
return formula_config
if __name__ == "__main__":
ak, sk, endpoint = get_s3_config("llm-raw")
if __name__ == '__main__':
ak, sk, endpoint = get_s3_config('llm-raw')
from magic_pdf.config.constants import CROSS_PAGE
from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
ContentType)
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.libs.commons import fitz # PyMuPDF
from magic_pdf.libs.Constants import CROSS_PAGE
from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
from magic_pdf.model.magic_model import MagicModel
......
class DropReason:
TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap" # 文字块有水平互相覆盖,导致无法准确定位文字顺序
USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap" # 需保留的block水平覆盖
COMPLICATED_LAYOUT = "complicated_layout" # 复杂的布局,暂时不支持
TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns" # 目前不支持分栏超过2列的
COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box" # 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。
HIGH_COMPUTATIONAL_lOAD_BY_IMGS = "high_computational_load_by_imgs" # 含特殊图片,计算量太大,从而丢弃
HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs" # 特殊的SVG图,计算量太大,从而丢弃
HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages" # 计算量超过负荷,当前方法下计算量消耗过大
MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result" # 版面分析失败
Exception = "_exception" # 解析中发生异常
ENCRYPTED = "encrypted" # PDF是加密的
EMPTY_PDF = "total_page=0" # PDF页面总数为0
NOT_IS_TEXT_PDF = "not_is_text_pdf" # 不是文字版PDF,无法直接解析
DENSE_SINGLE_LINE_BLOCK = "dense_single_line_block" # 无法清晰的分段
TITLE_DETECTION_FAILED = "title_detection_failed" # 探测标题失败
TITLE_LEVEL_FAILED = "title_level_failed" # 分析标题级别失败(例如一级、二级、三级标题)
PARA_SPLIT_FAILED = "para_split_failed" # 识别段落失败
PARA_MERGE_FAILED = "para_merge_failed" # 段落合并失败
NOT_ALLOW_LANGUAGE = "not_allow_language" # 不支持的语种
SPECIAL_PDF = "special_pdf"
PSEUDO_SINGLE_COLUMN = "pseudo_single_column" # 无法精确判断文字分栏
CAN_NOT_DETECT_PAGE_LAYOUT="can_not_detect_page_layout" # 无法分析页面的版面
NEGATIVE_BBOX_AREA = "negative_bbox_area" # 缩放导致 bbox 面积为负
OVERLAP_BLOCKS_CAN_NOT_SEPARATION = "overlap_blocks_can_t_separation" # 无法分离重叠的block
\ No newline at end of file