Unverified Commit 0c7a0882 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #2611 from myhloli/dev

Dev
parents 3bd0ecf1 a392f445
from enum import Enum
from pydantic import BaseModel, Field
# rag
class CategoryType(Enum): # py310 not support StrEnum
text = 'text'
title = 'title'
interline_equation = 'interline_equation'
image = 'image'
image_body = 'image_body'
image_caption = 'image_caption'
table = 'table'
table_body = 'table_body'
table_caption = 'table_caption'
table_footnote = 'table_footnote'
class ElementRelType(Enum):
sibling = 'sibling'
class PageInfo(BaseModel):
page_no: int = Field(description='the index of page, start from zero',
ge=0)
height: int = Field(description='the height of page', gt=0)
width: int = Field(description='the width of page', ge=0)
image_path: str | None = Field(description='the image of this page',
default=None)
class ContentObject(BaseModel):
category_type: CategoryType = Field(description='类别')
poly: list[float] = Field(
description=('Coordinates, need to convert back to PDF coordinates,'
' order is top-left, top-right, bottom-right, bottom-left'
' x,y coordinates'))
ignore: bool = Field(description='whether ignore this object',
default=False)
text: str | None = Field(description='text content of the object',
default=None)
image_path: str | None = Field(description='path of embedded image',
default=None)
order: int = Field(description='the order of this object within a page',
default=-1)
anno_id: int = Field(description='unique id', default=-1)
latex: str | None = Field(description='latex result', default=None)
html: str | None = Field(description='html result', default=None)
class ElementRelation(BaseModel):
source_anno_id: int = Field(description='unique id of the source object',
default=-1)
target_anno_id: int = Field(description='unique id of the target object',
default=-1)
relation: ElementRelType = Field(
description='the relation between source and target element')
class LayoutElementsExtra(BaseModel):
element_relation: list[ElementRelation] = Field(
description='the relation between source and target element')
class LayoutElements(BaseModel):
layout_dets: list[ContentObject] = Field(
description='layout element details')
page_info: PageInfo = Field(description='page info')
extra: LayoutElementsExtra = Field(description='extra information')
# iter data format
class Node(BaseModel):
category_type: CategoryType = Field(description='类别')
text: str | None = Field(description='text content of the object',
default=None)
image_path: str | None = Field(description='path of embedded image',
default=None)
anno_id: int = Field(description='unique id', default=-1)
latex: str | None = Field(description='latex result', default=None)
html: str | None = Field(description='html result', default=None)
import json
import os
from pathlib import Path
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
from magic_pdf.integrations.rag.type import (CategoryType, ContentObject,
ElementRelation, ElementRelType,
LayoutElements,
LayoutElementsExtra, PageInfo)
from magic_pdf.tools.common import do_parse, prepare_env
def convert_middle_json_to_layout_elements(
json_data: dict,
output_dir: str,
) -> list[LayoutElements]:
uniq_anno_id = 0
res: list[LayoutElements] = []
for page_no, page_data in enumerate(json_data['pdf_info']):
order_id = 0
page_info = PageInfo(
height=int(page_data['page_size'][1]),
width=int(page_data['page_size'][0]),
page_no=page_no,
)
layout_dets: list[ContentObject] = []
extra_element_relation: list[ElementRelation] = []
for para_block in page_data['para_blocks']:
para_text = ''
para_type = para_block['type']
if para_type == BlockType.Text:
para_text = merge_para_with_text(para_block)
x0, y0, x1, y1 = para_block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.text,
text=para_text,
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
uniq_anno_id += 1
order_id += 1
layout_dets.append(content)
elif para_type == BlockType.Title:
para_text = merge_para_with_text(para_block)
x0, y0, x1, y1 = para_block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.title,
text=para_text,
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
uniq_anno_id += 1
order_id += 1
layout_dets.append(content)
elif para_type == BlockType.InterlineEquation:
para_text = merge_para_with_text(para_block)
x0, y0, x1, y1 = para_block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.interline_equation,
text=para_text,
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
uniq_anno_id += 1
order_id += 1
layout_dets.append(content)
elif para_type == BlockType.Image:
body_anno_id = -1
caption_anno_id = -1
for block in para_block['blocks']:
if block['type'] == BlockType.ImageBody:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.Image:
x0, y0, x1, y1 = block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.image_body,
image_path=os.path.join(
output_dir, span['image_path']),
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
body_anno_id = uniq_anno_id
uniq_anno_id += 1
order_id += 1
layout_dets.append(content)
for block in para_block['blocks']:
if block['type'] == BlockType.ImageCaption:
para_text += merge_para_with_text(block)
x0, y0, x1, y1 = block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.image_caption,
text=para_text,
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
caption_anno_id = uniq_anno_id
uniq_anno_id += 1
order_id += 1
layout_dets.append(content)
if body_anno_id > 0 and caption_anno_id > 0:
element_relation = ElementRelation(
relation=ElementRelType.sibling,
source_anno_id=body_anno_id,
target_anno_id=caption_anno_id,
)
extra_element_relation.append(element_relation)
elif para_type == BlockType.Table:
body_anno_id, caption_anno_id, footnote_anno_id = -1, -1, -1
for block in para_block['blocks']:
if block['type'] == BlockType.TableCaption:
para_text += merge_para_with_text(block)
x0, y0, x1, y1 = block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.table_caption,
text=para_text,
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
caption_anno_id = uniq_anno_id
uniq_anno_id += 1
order_id += 1
layout_dets.append(content)
for block in para_block['blocks']:
if block['type'] == BlockType.TableBody:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.Table:
x0, y0, x1, y1 = para_block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.table_body,
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
body_anno_id = uniq_anno_id
uniq_anno_id += 1
order_id += 1
# if processed by table model
if span.get('latex', ''):
content.latex = span['latex']
else:
content.image_path = os.path.join(
output_dir, span['image_path'])
layout_dets.append(content)
for block in para_block['blocks']:
if block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block)
x0, y0, x1, y1 = block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.table_footnote,
text=para_text,
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
footnote_anno_id = uniq_anno_id
uniq_anno_id += 1
order_id += 1
layout_dets.append(content)
if caption_anno_id != -1 and body_anno_id != -1:
element_relation = ElementRelation(
relation=ElementRelType.sibling,
source_anno_id=body_anno_id,
target_anno_id=caption_anno_id,
)
extra_element_relation.append(element_relation)
if footnote_anno_id != -1 and body_anno_id != -1:
element_relation = ElementRelation(
relation=ElementRelType.sibling,
source_anno_id=body_anno_id,
target_anno_id=footnote_anno_id,
)
extra_element_relation.append(element_relation)
res.append(
LayoutElements(
page_info=page_info,
layout_dets=layout_dets,
extra=LayoutElementsExtra(
element_relation=extra_element_relation),
))
return res
def inference(path, output_dir, method):
model_config.__use_inside_model__ = True
model_config.__model_mode__ = 'full'
if output_dir == '':
if os.path.isdir(path):
output_dir = os.path.join(path, 'output')
else:
output_dir = os.path.join(os.path.dirname(path), 'output')
local_image_dir, local_md_dir = prepare_env(output_dir,
str(Path(path).stem), method)
def read_fn(path):
disk_rw = FileBasedDataReader(os.path.dirname(path))
return disk_rw.read(os.path.basename(path))
def parse_doc(doc_path: str):
try:
file_name = str(Path(doc_path).stem)
pdf_data = read_fn(doc_path)
do_parse(
output_dir,
file_name,
pdf_data,
[],
method,
False,
f_draw_span_bbox=False,
f_draw_layout_bbox=False,
f_dump_md=False,
f_dump_middle_json=True,
f_dump_model_json=False,
f_dump_orig_pdf=False,
f_dump_content_list=False,
f_draw_model_bbox=False,
)
middle_json_fn = os.path.join(local_md_dir,
f'{file_name}_middle.json')
with open(middle_json_fn) as fd:
jso = json.load(fd)
os.remove(middle_json_fn)
return convert_middle_json_to_layout_elements(jso, local_image_dir)
except Exception as e:
logger.exception(e)
return parse_doc(path)
if __name__ == '__main__':
import pprint
base_dir = '/opt/data/pdf/resources/samples/'
if 0:
with open(base_dir + 'json_outputs/middle.json') as f:
d = json.load(f)
result = convert_middle_json_to_layout_elements(d, '/tmp')
pprint.pp(result)
if 0:
with open(base_dir + 'json_outputs/middle.3.json') as f:
d = json.load(f)
result = convert_middle_json_to_layout_elements(d, '/tmp')
pprint.pp(result)
if 1:
res = inference(
base_dir + 'samples/pdf/one_page_with_table_image.pdf',
'/tmp/output',
'ocr',
)
pprint.pp(res)
# Copyright (c) Opendatalab. All rights reserved.
import torch
import gc
def clean_memory(device='cuda'):
if device == 'cuda':
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
elif str(device).startswith("npu"):
import torch_npu
if torch_npu.npu.is_available():
torch_npu.npu.empty_cache()
elif str(device).startswith("mps"):
torch.mps.empty_cache()
gc.collect()
\ No newline at end of file
def join_path(*args):
return '/'.join(str(s).rstrip('/') for s in args)
def get_top_percent_list(num_list, percent):
"""
获取列表中前百分之多少的元素
:param num_list:
:param percent:
:return:
"""
if len(num_list) == 0:
top_percent_list = []
else:
# 对imgs_len_list排序
sorted_imgs_len_list = sorted(num_list, reverse=True)
# 计算 percent 的索引
top_percent_index = int(len(sorted_imgs_len_list) * percent)
# 取前80%的元素
top_percent_list = sorted_imgs_len_list[:top_percent_index]
return top_percent_list
def mymax(alist: list):
if len(alist) == 0:
return 0 # 空是0, 0*0也是0大小q
else:
return max(alist)
def parse_bucket_key(s3_full_path: str):
"""
输入 s3://bucket/path/to/my/file.txt
输出 bucket, path/to/my/file.txt
"""
s3_full_path = s3_full_path.strip()
if s3_full_path.startswith("s3://"):
s3_full_path = s3_full_path[5:]
if s3_full_path.startswith("/"):
s3_full_path = s3_full_path[1:]
bucket, key = s3_full_path.split("/", 1)
return bucket, key
def dict_to_list(input_dict):
items_list = []
for _, item in input_dict.items():
items_list.append(item)
return items_list
def get_scale_ratio(model_page_info, page):
pix = page.get_pixmap(dpi=72)
pymu_width = int(pix.w)
pymu_height = int(pix.h)
width_from_json = model_page_info['page_info']['width']
height_from_json = model_page_info['page_info']['height']
horizontal_scale_ratio = width_from_json / pymu_width
vertical_scale_ratio = height_from_json / pymu_height
return horizontal_scale_ratio, vertical_scale_ratio
import fitz
from magic_pdf.config.constants import CROSS_PAGE
from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
ContentType)
from magic_pdf.data.dataset import Dataset
from magic_pdf.model.magic_model import MagicModel
def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
new_rgb = []
for item in rgb_config:
item = float(item) / 255
new_rgb.append(item)
page_data = bbox_list[i]
for bbox in page_data:
x0, y0, x1, y1 = bbox
rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
if fill_config:
page.draw_rect(
rect_coords,
color=None,
fill=new_rgb,
fill_opacity=0.3,
width=0.5,
overlay=True,
) # Draw the rectangle
else:
page.draw_rect(
rect_coords,
color=new_rgb,
fill=None,
fill_opacity=1,
width=0.5,
overlay=True,
) # Draw the rectangle
def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config, draw_bbox=True):
new_rgb = []
for item in rgb_config:
item = float(item) / 255
new_rgb.append(item)
page_data = bbox_list[i]
for j, bbox in enumerate(page_data):
x0, y0, x1, y1 = bbox
rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
if draw_bbox:
if fill_config:
page.draw_rect(
rect_coords,
color=None,
fill=new_rgb,
fill_opacity=0.3,
width=0.5,
overlay=True,
) # Draw the rectangle
else:
page.draw_rect(
rect_coords,
color=new_rgb,
fill=None,
fill_opacity=1,
width=0.5,
overlay=True,
) # Draw the rectangle
page.insert_text(
(x1 + 2, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
) # Insert the index in the top left corner of the rectangle
def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
dropped_bbox_list = []
tables_list, tables_body_list = [], []
tables_caption_list, tables_footnote_list = [], []
imgs_list, imgs_body_list, imgs_caption_list = [], [], []
imgs_footnote_list = []
titles_list = []
texts_list = []
interequations_list = []
lists_list = []
indexs_list = []
for page in pdf_info:
page_dropped_list = []
tables, tables_body, tables_caption, tables_footnote = [], [], [], []
imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
titles = []
texts = []
interequations = []
lists = []
indices = []
for dropped_bbox in page['discarded_blocks']:
page_dropped_list.append(dropped_bbox['bbox'])
dropped_bbox_list.append(page_dropped_list)
for block in page['para_blocks']:
bbox = block['bbox']
if block['type'] == BlockType.Table:
tables.append(bbox)
for nested_block in block['blocks']:
bbox = nested_block['bbox']
if nested_block['type'] == BlockType.TableBody:
tables_body.append(bbox)
elif nested_block['type'] == BlockType.TableCaption:
tables_caption.append(bbox)
elif nested_block['type'] == BlockType.TableFootnote:
tables_footnote.append(bbox)
elif block['type'] == BlockType.Image:
imgs.append(bbox)
for nested_block in block['blocks']:
bbox = nested_block['bbox']
if nested_block['type'] == BlockType.ImageBody:
imgs_body.append(bbox)
elif nested_block['type'] == BlockType.ImageCaption:
imgs_caption.append(bbox)
elif nested_block['type'] == BlockType.ImageFootnote:
imgs_footnote.append(bbox)
elif block['type'] == BlockType.Title:
titles.append(bbox)
elif block['type'] == BlockType.Text:
texts.append(bbox)
elif block['type'] == BlockType.InterlineEquation:
interequations.append(bbox)
elif block['type'] == BlockType.List:
lists.append(bbox)
elif block['type'] == BlockType.Index:
indices.append(bbox)
tables_list.append(tables)
tables_body_list.append(tables_body)
tables_caption_list.append(tables_caption)
tables_footnote_list.append(tables_footnote)
imgs_list.append(imgs)
imgs_body_list.append(imgs_body)
imgs_caption_list.append(imgs_caption)
imgs_footnote_list.append(imgs_footnote)
titles_list.append(titles)
texts_list.append(texts)
interequations_list.append(interequations)
lists_list.append(lists)
indexs_list.append(indices)
layout_bbox_list = []
table_type_order = {
'table_caption': 1,
'table_body': 2,
'table_footnote': 3
}
for page in pdf_info:
page_block_list = []
for block in page['para_blocks']:
if block['type'] in [
BlockType.Text,
BlockType.Title,
BlockType.InterlineEquation,
BlockType.List,
BlockType.Index,
]:
bbox = block['bbox']
page_block_list.append(bbox)
elif block['type'] in [BlockType.Image]:
for sub_block in block['blocks']:
bbox = sub_block['bbox']
page_block_list.append(bbox)
elif block['type'] in [BlockType.Table]:
sorted_blocks = sorted(block['blocks'], key=lambda x: table_type_order[x['type']])
for sub_block in sorted_blocks:
bbox = sub_block['bbox']
page_block_list.append(bbox)
layout_bbox_list.append(page_block_list)
pdf_docs = fitz.open('pdf', pdf_bytes)
for i, page in enumerate(pdf_docs):
draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
# draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color !
draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
# draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102], True),
draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)
draw_bbox_without_number(i, lists_list, page, [40, 169, 92], True)
draw_bbox_without_number(i, indexs_list, page, [40, 169, 92], True)
draw_bbox_with_number(
i, layout_bbox_list, page, [255, 0, 0], False, draw_bbox=False
)
# Save the PDF
pdf_docs.save(f'{out_path}/{filename}')
def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
text_list = []
inline_equation_list = []
interline_equation_list = []
image_list = []
table_list = []
dropped_list = []
next_page_text_list = []
next_page_inline_equation_list = []
def get_span_info(span):
if span['type'] == ContentType.Text:
if span.get(CROSS_PAGE, False):
next_page_text_list.append(span['bbox'])
else:
page_text_list.append(span['bbox'])
elif span['type'] == ContentType.InlineEquation:
if span.get(CROSS_PAGE, False):
next_page_inline_equation_list.append(span['bbox'])
else:
page_inline_equation_list.append(span['bbox'])
elif span['type'] == ContentType.InterlineEquation:
page_interline_equation_list.append(span['bbox'])
elif span['type'] == ContentType.Image:
page_image_list.append(span['bbox'])
elif span['type'] == ContentType.Table:
page_table_list.append(span['bbox'])
for page in pdf_info:
page_text_list = []
page_inline_equation_list = []
page_interline_equation_list = []
page_image_list = []
page_table_list = []
page_dropped_list = []
# 将跨页的span放到移动到下一页的列表中
if len(next_page_text_list) > 0:
page_text_list.extend(next_page_text_list)
next_page_text_list.clear()
if len(next_page_inline_equation_list) > 0:
page_inline_equation_list.extend(next_page_inline_equation_list)
next_page_inline_equation_list.clear()
# 构造dropped_list
for block in page['discarded_blocks']:
if block['type'] == BlockType.Discarded:
for line in block['lines']:
for span in line['spans']:
page_dropped_list.append(span['bbox'])
dropped_list.append(page_dropped_list)
# 构造其余useful_list
# for block in page['para_blocks']: # span直接用分段合并前的结果就可以
for block in page['preproc_blocks']:
if block['type'] in [
BlockType.Text,
BlockType.Title,
BlockType.InterlineEquation,
BlockType.List,
BlockType.Index,
]:
for line in block['lines']:
for span in line['spans']:
get_span_info(span)
elif block['type'] in [BlockType.Image, BlockType.Table]:
for sub_block in block['blocks']:
for line in sub_block['lines']:
for span in line['spans']:
get_span_info(span)
text_list.append(page_text_list)
inline_equation_list.append(page_inline_equation_list)
interline_equation_list.append(page_interline_equation_list)
image_list.append(page_image_list)
table_list.append(page_table_list)
pdf_docs = fitz.open('pdf', pdf_bytes)
for i, page in enumerate(pdf_docs):
# 获取当前页面的数据
draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
# Save the PDF
pdf_docs.save(f'{out_path}/{filename}')
def draw_model_bbox(model_list, dataset: Dataset, out_path, filename):
dropped_bbox_list = []
tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
titles_list = []
texts_list = []
interequations_list = []
magic_model = MagicModel(model_list, dataset)
for i in range(len(model_list)):
page_dropped_list = []
tables_body, tables_caption, tables_footnote = [], [], []
imgs_body, imgs_caption, imgs_footnote = [], [], []
titles = []
texts = []
interequations = []
page_info = magic_model.get_model_list(i)
layout_dets = page_info['layout_dets']
for layout_det in layout_dets:
bbox = layout_det['bbox']
if layout_det['category_id'] == CategoryId.Text:
texts.append(bbox)
elif layout_det['category_id'] == CategoryId.Title:
titles.append(bbox)
elif layout_det['category_id'] == CategoryId.TableBody:
tables_body.append(bbox)
elif layout_det['category_id'] == CategoryId.TableCaption:
tables_caption.append(bbox)
elif layout_det['category_id'] == CategoryId.TableFootnote:
tables_footnote.append(bbox)
elif layout_det['category_id'] == CategoryId.ImageBody:
imgs_body.append(bbox)
elif layout_det['category_id'] == CategoryId.ImageCaption:
imgs_caption.append(bbox)
elif layout_det['category_id'] == CategoryId.InterlineEquation_YOLO:
interequations.append(bbox)
elif layout_det['category_id'] == CategoryId.Abandon:
page_dropped_list.append(bbox)
elif layout_det['category_id'] == CategoryId.ImageFootnote:
imgs_footnote.append(bbox)
tables_body_list.append(tables_body)
tables_caption_list.append(tables_caption)
tables_footnote_list.append(tables_footnote)
imgs_body_list.append(imgs_body)
imgs_caption_list.append(imgs_caption)
titles_list.append(titles)
texts_list.append(texts)
interequations_list.append(interequations)
dropped_bbox_list.append(page_dropped_list)
imgs_footnote_list.append(imgs_footnote)
for i in range(len(dataset)):
page = dataset.get_page(i)
draw_bbox_with_number(
i, dropped_bbox_list, page, [158, 158, 158], True
) # color !
draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102], True)
draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204], True)
draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255], True)
draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102], True)
draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
# Save the PDF
dataset.dump_to_file(f'{out_path}/{filename}')
def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
layout_bbox_list = []
for page in pdf_info:
page_line_list = []
for block in page['preproc_blocks']:
if block['type'] in [BlockType.Text]:
for line in block['lines']:
bbox = line['bbox']
index = line['index']
page_line_list.append({'index': index, 'bbox': bbox})
elif block['type'] in [BlockType.Title, BlockType.InterlineEquation]:
if 'virtual_lines' in block:
if len(block['virtual_lines']) > 0 and block['virtual_lines'][0].get('index', None) is not None:
for line in block['virtual_lines']:
bbox = line['bbox']
index = line['index']
page_line_list.append({'index': index, 'bbox': bbox})
else:
for line in block['lines']:
bbox = line['bbox']
index = line['index']
page_line_list.append({'index': index, 'bbox': bbox})
elif block['type'] in [BlockType.Image, BlockType.Table]:
for sub_block in block['blocks']:
if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
if len(sub_block['virtual_lines']) > 0 and sub_block['virtual_lines'][0].get('index', None) is not None:
for line in sub_block['virtual_lines']:
bbox = line['bbox']
index = line['index']
page_line_list.append({'index': index, 'bbox': bbox})
else:
for line in sub_block['lines']:
bbox = line['bbox']
index = line['index']
page_line_list.append({'index': index, 'bbox': bbox})
elif sub_block['type'] in [BlockType.ImageCaption, BlockType.TableCaption, BlockType.ImageFootnote, BlockType.TableFootnote]:
for line in sub_block['lines']:
bbox = line['bbox']
index = line['index']
page_line_list.append({'index': index, 'bbox': bbox})
sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
pdf_docs = fitz.open('pdf', pdf_bytes)
for i, page in enumerate(pdf_docs):
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
pdf_docs.save(f'{out_path}/{filename}')
def draw_char_bbox(pdf_bytes, out_path, filename):
pdf_docs = fitz.open('pdf', pdf_bytes)
for i, page in enumerate(pdf_docs):
for block in page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']:
for line in block['lines']:
for span in line['spans']:
for char in span['chars']:
char_bbox = char['bbox']
page.draw_rect(char_bbox, color=[1, 0, 0], fill=None, fill_opacity=1, width=0.3, overlay=True,)
pdf_docs.save(f'{out_path}/{filename}')
import json
import brotli
import base64
class JsonCompressor:
@staticmethod
def compress_json(data):
"""
Compress a json object and encode it with base64
"""
json_str = json.dumps(data)
json_bytes = json_str.encode('utf-8')
compressed = brotli.compress(json_bytes, quality=6)
compressed_str = base64.b64encode(compressed).decode('utf-8') # convert bytes to string
return compressed_str
@staticmethod
def decompress_json(compressed_str):
"""
Decode the base64 string and decompress the json object
"""
compressed = base64.b64decode(compressed_str.encode('utf-8')) # convert string to bytes
decompressed_bytes = brotli.decompress(compressed)
json_str = decompressed_bytes.decode('utf-8')
data = json.loads(json_str)
return data
def float_gt(a, b):
if 0.0001 >= abs(a -b):
return False
return a > b
def float_equal(a, b):
if 0.0001 >= abs(a-b):
return True
return False
\ No newline at end of file
def ocr_escape_special_markdown_char(content):
"""
转义正文里对markdown语法有特殊意义的字符
"""
special_chars = ["*", "`", "~", "$"]
for char in special_chars:
content = content.replace(char, "\\" + char)
return content
import fitz
import numpy as np
from loguru import logger
import re
from io import BytesIO
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
def calculate_sample_count(total_page: int):
"""
根据总页数和采样率计算采样页面的数量。
"""
select_page_cnt = min(10, total_page)
return select_page_cnt
def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
pdf_docs = fitz.open("pdf", src_pdf_bytes)
total_page = len(pdf_docs)
if total_page == 0:
# 如果PDF没有页面,直接返回空文档
logger.warning("PDF is empty, return empty document")
return fitz.Document()
select_page_cnt = calculate_sample_count(total_page)
page_num = np.random.choice(total_page, select_page_cnt, replace=False)
sample_docs = fitz.Document()
try:
for index in page_num:
sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
except Exception as e:
logger.exception(e)
return sample_docs
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
""""
检测PDF中是否包含非法字符
"""
'''pdfminer比较慢,需要先随机抽取10页左右的sample'''
sample_docs = extract_pages(src_pdf_bytes)
sample_pdf_bytes = sample_docs.tobytes()
sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
laparams = LAParams(
line_overlap=0.5,
char_margin=2.0,
line_margin=0.5,
word_margin=0.1,
boxes_flow=None,
detect_vertical=False,
all_texts=False,
)
text = extract_text(pdf_file=sample_pdf_file_like_object, laparams=laparams)
text = text.replace("\n", "")
# logger.info(text)
'''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
cid_pattern = re.compile(r'\(cid:\d+\)')
matches = cid_pattern.findall(text)
cid_count = len(matches)
cid_len = sum(len(match) for match in matches)
text_len = len(text)
if text_len == 0:
cid_chars_radio = 0
else:
cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
'''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
if cid_chars_radio > 0.05:
return False # 乱码文档
else:
return True # 正常文档
def count_replacement_characters(text: str) -> int:
"""
统计字符串中 0xfffd 字符的数量。
"""
return text.count('\ufffd')
def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
sample_docs = extract_pages(src_pdf_bytes)
doc_text = ""
for page in sample_docs:
page_text = page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)
doc_text += page_text
text_len = len(doc_text)
uffd_count = count_replacement_characters(doc_text)
if text_len == 0:
uffd_chars_radio = 0
else:
uffd_chars_radio = uffd_count / text_len
logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}")
'''当一篇文章存在1%以上的文本是乱码时,认为该文档为乱码文档'''
if uffd_chars_radio > 0.01:
return False # 乱码文档
else:
return True # 正常文档
\ No newline at end of file
from io import BytesIO
import cv2
import fitz
import numpy as np
from PIL import Image
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: DataWriter):
"""从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 save_path:需要同时支持s3和本地,
图片存放在save_path下,文件名是:
{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。"""
# 拼接文件名
filename = f'{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}'
# 老版本返回不带bucket的路径
img_path = join_path(return_path, filename) if return_path is not None else None
# 新版本生成平铺路径
img_hash256_path = f'{compute_sha256(img_path)}.jpg'
# 将坐标转换为fitz.Rect对象
rect = fitz.Rect(*bbox)
# 配置缩放倍数为3倍
zoom = fitz.Matrix(3, 3)
# 截取图片
pix = page.get_pixmap(clip=rect, matrix=zoom)
byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
imageWriter.write(img_hash256_path, byte_data)
return img_hash256_path
def cut_image_to_pil_image(bbox: tuple, page: fitz.Page, mode="pillow"):
# 将坐标转换为fitz.Rect对象
rect = fitz.Rect(*bbox)
# 配置缩放倍数为3倍
zoom = fitz.Matrix(3, 3)
# 截取图片
pix = page.get_pixmap(clip=rect, matrix=zoom)
if mode == "cv2":
# 直接转换为numpy数组供cv2使用
img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
# PyMuPDF使用RGB顺序,而cv2使用BGR顺序
if pix.n == 3 or pix.n == 4:
image_result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
else:
image_result = img_array
elif mode == "pillow":
# 将字节数据转换为文件对象
image_file = BytesIO(pix.tobytes(output='png'))
# 使用 Pillow 打开图像
image_result = Image.open(image_file)
else:
raise ValueError(f"mode: {mode} is not supported.")
return image_result
\ No newline at end of file
import time
import functools
from collections import defaultdict
from typing import Dict, List
class PerformanceStats:
"""性能统计类,用于收集和展示方法执行时间"""
_stats: Dict[str, List[float]] = defaultdict(list)
@classmethod
def add_execution_time(cls, func_name: str, execution_time: float):
"""添加执行时间记录"""
cls._stats[func_name].append(execution_time)
@classmethod
def get_stats(cls) -> Dict[str, dict]:
"""获取统计结果"""
results = {}
for func_name, times in cls._stats.items():
results[func_name] = {
'count': len(times),
'total_time': sum(times),
'avg_time': sum(times) / len(times),
'min_time': min(times),
'max_time': max(times)
}
return results
@classmethod
def print_stats(cls):
"""打印统计结果"""
stats = cls.get_stats()
print("\n性能统计结果:")
print("-" * 80)
print(f"{'方法名':<40} {'调用次数':>8} {'总时间(s)':>12} {'平均时间(s)':>12}")
print("-" * 80)
for func_name, data in stats.items():
print(f"{func_name:<40} {data['count']:8d} {data['total_time']:12.6f} {data['avg_time']:12.6f}")
def measure_time(func):
"""测量方法执行时间的装饰器"""
@functools.wraps(func)
def wrapper(*args, **kwargs):
start_time = time.time()
result = func(*args, **kwargs)
execution_time = time.time() - start_time
# 获取更详细的函数标识
if hasattr(func, "__self__"): # 实例方法
class_name = func.__self__.__class__.__name__
full_name = f"{class_name}.{func.__name__}"
elif hasattr(func, "__qualname__"): # 类方法或静态方法
full_name = func.__qualname__
else:
module_name = func.__module__
full_name = f"{module_name}.{func.__name__}"
PerformanceStats.add_execution_time(full_name, execution_time)
return result
return wrapper
\ No newline at end of file
import os
def sanitize_filename(filename, replacement="_"):
if os.name == 'nt':
invalid_chars = '<>:"|?*'
for char in invalid_chars:
filename = filename.replace(char, replacement)
return filename
__use_inside_model__ = True
__model_mode__ = 'full'
\ No newline at end of file
import os
import time
import numpy as np
import torch
os.environ['FLAGS_npu_jit_compile'] = '0' # 关闭paddle的jit编译
os.environ['FLAGS_use_stride_kernel'] = '0'
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 让mps可以fallback
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
from loguru import logger
from magic_pdf.model.sub_modules.model_utils import get_vram
from magic_pdf.config.enums import SupportedPdfParseMethod
import magic_pdf.model as model_config
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.clean_memory import clean_memory
from magic_pdf.libs.config_reader import (get_device, get_formula_config,
get_layout_config,
get_local_models_dir,
get_table_recog_config)
from magic_pdf.model.model_list import MODEL
class ModelSingleton:
_instance = None
_models = {}
def __new__(cls, *args, **kwargs):
if cls._instance is None:
cls._instance = super().__new__(cls)
return cls._instance
def get_model(
self,
ocr: bool,
show_log: bool,
lang=None,
layout_model=None,
formula_enable=None,
table_enable=None,
):
key = (ocr, show_log, lang, layout_model, formula_enable, table_enable)
if key not in self._models:
self._models[key] = custom_model_init(
ocr=ocr,
show_log=show_log,
lang=lang,
layout_model=layout_model,
formula_enable=formula_enable,
table_enable=table_enable,
)
return self._models[key]
def custom_model_init(
ocr: bool = False,
show_log: bool = False,
lang=None,
layout_model=None,
formula_enable=None,
table_enable=None,
):
model = None
if model_config.__model_mode__ == 'lite':
logger.warning(
'The Lite mode is provided for developers to conduct testing only, and the output quality is '
'not guaranteed to be reliable.'
)
model = MODEL.Paddle
elif model_config.__model_mode__ == 'full':
model = MODEL.PEK
if model_config.__use_inside_model__:
model_init_start = time.time()
if model == MODEL.Paddle:
from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log, lang=lang)
elif model == MODEL.PEK:
from magic_pdf.model.pdf_extract_kit import CustomPEKModel
# 从配置文件读取model-dir和device
local_models_dir = get_local_models_dir()
device = get_device()
layout_config = get_layout_config()
if layout_model is not None:
layout_config['model'] = layout_model
formula_config = get_formula_config()
if formula_enable is not None:
formula_config['enable'] = formula_enable
table_config = get_table_recog_config()
if table_enable is not None:
table_config['enable'] = table_enable
model_input = {
'ocr': ocr,
'show_log': show_log,
'models_dir': local_models_dir,
'device': device,
'table_config': table_config,
'layout_config': layout_config,
'formula_config': formula_config,
'lang': lang,
}
custom_model = CustomPEKModel(**model_input)
else:
logger.error('Not allow model_name!')
exit(1)
model_init_cost = time.time() - model_init_start
logger.info(f'model init cost: {model_init_cost}')
else:
logger.error('use_inside_model is False, not allow to use inside model')
exit(1)
return custom_model
def doc_analyze(
dataset: Dataset,
ocr: bool = False,
show_log: bool = False,
start_page_id=0,
end_page_id=None,
lang=None,
layout_model=None,
formula_enable=None,
table_enable=None,
):
end_page_id = (
end_page_id
if end_page_id is not None and end_page_id >= 0
else len(dataset) - 1
)
MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
images = []
page_wh_list = []
for index in range(len(dataset)):
if start_page_id <= index <= end_page_id:
page_data = dataset.get_page(index)
img_dict = page_data.get_image()
images.append(img_dict['img'])
page_wh_list.append((img_dict['width'], img_dict['height']))
images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(images))]
if len(images) >= MIN_BATCH_INFERENCE_SIZE:
batch_size = MIN_BATCH_INFERENCE_SIZE
batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
else:
batch_images = [images_with_extra_info]
results = []
processed_images_count = 0
for index, batch_image in enumerate(batch_images):
processed_images_count += len(batch_image)
logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
result = may_batch_image_analyze(batch_image, ocr, show_log,layout_model, formula_enable, table_enable)
results.extend(result)
model_json = []
for index in range(len(dataset)):
if start_page_id <= index <= end_page_id:
result = results.pop(0)
page_width, page_height = page_wh_list.pop(0)
else:
result = []
page_height = 0
page_width = 0
page_info = {'page_no': index, 'width': page_width, 'height': page_height}
page_dict = {'layout_dets': result, 'page_info': page_info}
model_json.append(page_dict)
from magic_pdf.operators.models import InferenceResult
return InferenceResult(model_json, dataset)
def batch_doc_analyze(
datasets: list[Dataset],
parse_method: str = 'auto',
show_log: bool = False,
lang=None,
layout_model=None,
formula_enable=None,
table_enable=None,
):
MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
batch_size = MIN_BATCH_INFERENCE_SIZE
page_wh_list = []
images_with_extra_info = []
for dataset in datasets:
ocr = False
if parse_method == 'auto':
if dataset.classify() == SupportedPdfParseMethod.TXT:
ocr = False
elif dataset.classify() == SupportedPdfParseMethod.OCR:
ocr = True
elif parse_method == 'ocr':
ocr = True
elif parse_method == 'txt':
ocr = False
_lang = dataset._lang
for index in range(len(dataset)):
page_data = dataset.get_page(index)
img_dict = page_data.get_image()
page_wh_list.append((img_dict['width'], img_dict['height']))
images_with_extra_info.append((img_dict['img'], ocr, _lang))
batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
results = []
processed_images_count = 0
for index, batch_image in enumerate(batch_images):
processed_images_count += len(batch_image)
logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
result = may_batch_image_analyze(batch_image, True, show_log, layout_model, formula_enable, table_enable)
results.extend(result)
infer_results = []
from magic_pdf.operators.models import InferenceResult
for index in range(len(datasets)):
dataset = datasets[index]
model_json = []
for i in range(len(dataset)):
result = results.pop(0)
page_width, page_height = page_wh_list.pop(0)
page_info = {'page_no': i, 'width': page_width, 'height': page_height}
page_dict = {'layout_dets': result, 'page_info': page_info}
model_json.append(page_dict)
infer_results.append(InferenceResult(model_json, dataset))
return infer_results
def may_batch_image_analyze(
images_with_extra_info: list[(np.ndarray, bool, str)],
ocr: bool,
show_log: bool = False,
layout_model=None,
formula_enable=None,
table_enable=None):
# os.environ['CUDA_VISIBLE_DEVICES'] = str(idx)
from magic_pdf.model.batch_analyze import BatchAnalyze
model_manager = ModelSingleton()
# images = [image for image, _, _ in images_with_extra_info]
batch_ratio = 1
device = get_device()
if str(device).startswith('npu'):
import torch_npu
if torch_npu.npu.is_available():
torch.npu.set_compile_mode(jit_compile=False)
if str(device).startswith('npu') or str(device).startswith('cuda'):
vram = get_vram(device)
if vram is not None:
gpu_memory = int(os.getenv('VIRTUAL_VRAM_SIZE', round(vram)))
if gpu_memory >= 16:
batch_ratio = 16
elif gpu_memory >= 12:
batch_ratio = 8
elif gpu_memory >= 8:
batch_ratio = 4
elif gpu_memory >= 6:
batch_ratio = 2
else:
batch_ratio = 1
logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
else:
# Default batch_ratio when VRAM can't be determined
batch_ratio = 1
logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_ratio}')
# doc_analyze_start = time.time()
batch_model = BatchAnalyze(model_manager, batch_ratio, show_log, layout_model, formula_enable, table_enable)
results = batch_model(images_with_extra_info)
# gc_start = time.time()
clean_memory(get_device())
# gc_time = round(time.time() - gc_start, 2)
# logger.debug(f'gc time: {gc_time}')
# doc_analyze_time = round(time.time() - doc_analyze_start, 2)
# doc_analyze_speed = round(len(images) / doc_analyze_time, 2)
# logger.debug(
# f'doc analyze time: {round(time.time() - doc_analyze_start, 2)},'
# f' speed: {doc_analyze_speed} pages/second'
# )
return results
\ No newline at end of file
# flake8: noqa
import os
import time
import cv2
import torch
import yaml
from loguru import logger
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
from magic_pdf.config.constants import *
from magic_pdf.model.model_list import AtomicModel
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
from magic_pdf.model.sub_modules.model_utils import (
clean_vram, crop_img, get_res_list_from_layout_res)
from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.ocr_utils import (
get_adjusted_mfdetrec_res, get_ocr_result_list)
class CustomPEKModel:
def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs):
"""
======== model init ========
"""
# 获取当前文件(即 pdf_extract_kit.py)的绝对路径
current_file_path = os.path.abspath(__file__)
# 获取当前文件所在的目录(model)
current_dir = os.path.dirname(current_file_path)
# 上一级目录(magic_pdf)
root_dir = os.path.dirname(current_dir)
# model_config目录
model_config_dir = os.path.join(root_dir, 'resources', 'model_config')
# 构建 model_configs.yaml 文件的完整路径
config_path = os.path.join(model_config_dir, 'model_configs.yaml')
with open(config_path, 'r', encoding='utf-8') as f:
self.configs = yaml.load(f, Loader=yaml.FullLoader)
# 初始化解析配置
# layout config
self.layout_config = kwargs.get('layout_config')
self.layout_model_name = self.layout_config.get(
'model', MODEL_NAME.DocLayout_YOLO
)
# formula config
self.formula_config = kwargs.get('formula_config')
self.mfd_model_name = self.formula_config.get(
'mfd_model', MODEL_NAME.YOLO_V8_MFD
)
self.mfr_model_name = self.formula_config.get(
'mfr_model', MODEL_NAME.UniMerNet_v2_Small
)
self.apply_formula = self.formula_config.get('enable', True)
# table config
self.table_config = kwargs.get('table_config')
self.apply_table = self.table_config.get('enable', False)
self.table_max_time = self.table_config.get('max_time', TABLE_MAX_TIME_VALUE)
self.table_model_name = self.table_config.get('model', MODEL_NAME.RAPID_TABLE)
self.table_sub_model_name = self.table_config.get('sub_model', None)
# ocr config
self.apply_ocr = ocr
self.lang = kwargs.get('lang', None)
logger.info(
'DocAnalysis init, this may take some times, layout_model: {}, apply_formula: {}, apply_ocr: {}, '
'apply_table: {}, table_model: {}, lang: {}'.format(
self.layout_model_name,
self.apply_formula,
self.apply_ocr,
self.apply_table,
self.table_model_name,
self.lang,
)
)
# 初始化解析方案
self.device = kwargs.get('device', 'cpu')
logger.info('using device: {}'.format(self.device))
models_dir = kwargs.get(
'models_dir', os.path.join(root_dir, 'resources', 'models')
)
logger.info('using models_dir: {}'.format(models_dir))
atom_model_manager = AtomModelSingleton()
# 初始化公式识别
if self.apply_formula:
# 初始化公式检测模型
self.mfd_model = atom_model_manager.get_atom_model(
atom_model_name=AtomicModel.MFD,
mfd_weights=str(
os.path.join(
models_dir, self.configs['weights'][self.mfd_model_name]
)
),
device=self.device,
)
# 初始化公式解析模型
mfr_weight_dir = str(
os.path.join(models_dir, self.configs['weights'][self.mfr_model_name])
)
mfr_cfg_path = str(os.path.join(model_config_dir, 'UniMERNet', 'demo.yaml'))
self.mfr_model = atom_model_manager.get_atom_model(
atom_model_name=AtomicModel.MFR,
mfr_weight_dir=mfr_weight_dir,
mfr_cfg_path=mfr_cfg_path,
device=self.device,
)
# 初始化layout模型
if self.layout_model_name == MODEL_NAME.LAYOUTLMv3:
self.layout_model = atom_model_manager.get_atom_model(
atom_model_name=AtomicModel.Layout,
layout_model_name=MODEL_NAME.LAYOUTLMv3,
layout_weights=str(
os.path.join(
models_dir, self.configs['weights'][self.layout_model_name]
)
),
layout_config_file=str(
os.path.join(
model_config_dir, 'layoutlmv3', 'layoutlmv3_base_inference.yaml'
)
),
device='cpu' if str(self.device).startswith("mps") else self.device,
)
elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO:
self.layout_model = atom_model_manager.get_atom_model(
atom_model_name=AtomicModel.Layout,
layout_model_name=MODEL_NAME.DocLayout_YOLO,
doclayout_yolo_weights=str(
os.path.join(
models_dir, self.configs['weights'][self.layout_model_name]
)
),
device=self.device,
)
# 初始化ocr
self.ocr_model = atom_model_manager.get_atom_model(
atom_model_name=AtomicModel.OCR,
ocr_show_log=show_log,
det_db_box_thresh=0.3,
lang=self.lang
)
# init table model
if self.apply_table:
table_model_dir = self.configs['weights'][self.table_model_name]
self.table_model = atom_model_manager.get_atom_model(
atom_model_name=AtomicModel.Table,
table_model_name=self.table_model_name,
table_model_path=str(os.path.join(models_dir, table_model_dir)),
table_max_time=self.table_max_time,
device=self.device,
ocr_engine=self.ocr_model,
table_sub_model_name=self.table_sub_model_name
)
logger.info('DocAnalysis init done!')
def __call__(self, image):
# layout检测
layout_start = time.time()
layout_res = []
if self.layout_model_name == MODEL_NAME.LAYOUTLMv3:
# layoutlmv3
layout_res = self.layout_model(image, ignore_catids=[])
elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO:
layout_res = self.layout_model.predict(image)
layout_cost = round(time.time() - layout_start, 2)
logger.info(f'layout detection time: {layout_cost}')
if self.apply_formula:
# 公式检测
mfd_start = time.time()
mfd_res = self.mfd_model.predict(image)
logger.info(f'mfd time: {round(time.time() - mfd_start, 2)}')
# 公式识别
mfr_start = time.time()
formula_list = self.mfr_model.predict(mfd_res, image)
layout_res.extend(formula_list)
mfr_cost = round(time.time() - mfr_start, 2)
logger.info(f'formula nums: {len(formula_list)}, mfr time: {mfr_cost}')
# 清理显存
clean_vram(self.device, vram_threshold=6)
# 从layout_res中获取ocr区域、表格区域、公式区域
ocr_res_list, table_res_list, single_page_mfdetrec_res = (
get_res_list_from_layout_res(layout_res)
)
# ocr识别
ocr_start = time.time()
# Process each area that requires OCR processing
for res in ocr_res_list:
new_image, useful_list = crop_img(res, image, crop_paste_x=50, crop_paste_y=50)
adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(single_page_mfdetrec_res, useful_list)
# OCR recognition
new_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)
if self.apply_ocr:
ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0]
else:
ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res, rec=False)[0]
# Integration results
if ocr_res:
ocr_result_list = get_ocr_result_list(ocr_res, useful_list)
layout_res.extend(ocr_result_list)
ocr_cost = round(time.time() - ocr_start, 2)
if self.apply_ocr:
logger.info(f"ocr time: {ocr_cost}")
else:
logger.info(f"det time: {ocr_cost}")
# 表格识别 table recognition
if self.apply_table:
table_start = time.time()
for res in table_res_list:
new_image, _ = crop_img(res, image)
single_table_start_time = time.time()
html_code = None
if self.table_model_name == MODEL_NAME.STRUCT_EQTABLE:
with torch.no_grad():
table_result = self.table_model.predict(new_image, 'html')
if len(table_result) > 0:
html_code = table_result[0]
elif self.table_model_name == MODEL_NAME.TABLE_MASTER:
html_code = self.table_model.img2html(new_image)
elif self.table_model_name == MODEL_NAME.RAPID_TABLE:
html_code, table_cell_bboxes, logic_points, elapse = self.table_model.predict(
new_image
)
run_time = time.time() - single_table_start_time
if run_time > self.table_max_time:
logger.warning(
f'table recognition processing exceeds max time {self.table_max_time}s'
)
# 判断是否返回正常
if html_code:
expected_ending = html_code.strip().endswith(
'</html>'
) or html_code.strip().endswith('</table>')
if expected_ending:
res['html'] = html_code
else:
logger.warning(
'table recognition processing fails, not found expected HTML table end'
)
else:
logger.warning(
'table recognition processing fails, not get html return'
)
logger.info(f'table time: {round(time.time() - table_start, 2)}')
return layout_res
import random
from loguru import logger
try:
from paddleocr import PPStructure
except ImportError:
logger.error('paddleocr not installed, please install by "pip install magic-pdf[lite]"')
exit(1)
def region_to_bbox(region):
x0 = region[0][0]
y0 = region[0][1]
x1 = region[2][0]
y1 = region[2][1]
return [x0, y0, x1, y1]
class CustomPaddleModel:
def __init__(self,
ocr: bool = False,
show_log: bool = False,
lang=None,
det_db_box_thresh=0.3,
use_dilation=True,
det_db_unclip_ratio=1.8
):
if lang is not None:
self.model = PPStructure(table=False,
ocr=True,
show_log=show_log,
lang=lang,
det_db_box_thresh=det_db_box_thresh,
use_dilation=use_dilation,
det_db_unclip_ratio=det_db_unclip_ratio,
)
else:
self.model = PPStructure(table=False,
ocr=True,
show_log=show_log,
det_db_box_thresh=det_db_box_thresh,
use_dilation=use_dilation,
det_db_unclip_ratio=det_db_unclip_ratio,
)
def __call__(self, img):
try:
import cv2
except ImportError:
logger.error("opencv-python not installed, please install by pip.")
exit(1)
# 将RGB图片转换为BGR格式适配paddle
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
result = self.model(img)
spans = []
for line in result:
line.pop("img")
"""
为paddle输出适配type no.
title: 0 # 标题
text: 1 # 文本
header: 2 # abandon
footer: 2 # abandon
reference: 1 # 文本 or abandon
equation: 8 # 行间公式 block
equation: 14 # 行间公式 text
figure: 3 # 图片
figure_caption: 4 # 图片描述
table: 5 # 表格
table_caption: 6 # 表格描述
"""
if line["type"] == "title":
line["category_id"] = 0
elif line["type"] in ["text", "reference"]:
line["category_id"] = 1
elif line["type"] == "figure":
line["category_id"] = 3
elif line["type"] == "figure_caption":
line["category_id"] = 4
elif line["type"] == "table":
line["category_id"] = 5
elif line["type"] == "table_caption":
line["category_id"] = 6
elif line["type"] == "equation":
line["category_id"] = 8
elif line["type"] in ["header", "footer"]:
line["category_id"] = 2
else:
logger.warning(f"unknown type: {line['type']}")
# 兼容不输出score的paddleocr版本
if line.get("score") is None:
line["score"] = 0.5 + random.random() * 0.5
res = line.pop("res", None)
if res is not None and len(res) > 0:
for span in res:
new_span = {
"category_id": 15,
"bbox": region_to_bbox(span["text_region"]),
"score": span["confidence"],
"text": span["text"],
}
spans.append(new_span)
if len(spans) > 0:
result.extend(spans)
return result
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment