Unverified Commit 0c7a0882 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #2611 from myhloli/dev

Dev
parents 3bd0ecf1 a392f445
import json
from io import BytesIO
from pypdf import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from .enum_class import BlockType, ContentType
def draw_bbox_without_number(i, bbox_list, page, c, rgb_config, fill_config):
    """Draw plain (un-numbered) rectangles for page ``i``'s bboxes on a canvas.

    Args:
        i: Page index into ``bbox_list``.
        bbox_list: Per-page lists of [x0, y0, x1, y1] boxes in top-left-origin coords.
        page: pypdf page object; only ``cropbox`` is read for the page height.
        c: reportlab canvas to draw on.
        rgb_config: Color as 0-255 ``[r, g, b]``.
        fill_config: Truthy → translucent filled rectangles; falsy → outlines.

    Returns:
        The same canvas ``c`` (drawn on in place).
    """
    new_rgb = [float(color) / 255 for color in rgb_config]
    page_data = bbox_list[i]
    # Force float: pypdf cropbox entries can be Decimal-like NumberObjects,
    # which reportlab arithmetic does not like (mirrors draw_bbox_with_number).
    page_height = float(page.cropbox[3])
    for bbox in page_data:
        x0, y0, x1, y1 = map(float, bbox)
        width = x1 - x0
        height = y1 - y0
        # Flip y: the PDF canvas origin is bottom-left, bboxes are top-left based.
        rect = [x0, page_height - y1, width, height]
        if fill_config:  # translucent filled rectangle
            c.setFillColorRGB(new_rgb[0], new_rgb[1], new_rgb[2], 0.3)
            c.rect(rect[0], rect[1], rect[2], rect[3], stroke=0, fill=1)
        else:  # outline only
            c.setStrokeColorRGB(new_rgb[0], new_rgb[1], new_rgb[2])
            c.rect(rect[0], rect[1], rect[2], rect[3], stroke=1, fill=0)
    return c
def draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_config, draw_bbox=True):
    """Draw rectangles for page ``i``'s bboxes and number them 1..N.

    The 1-based index of each bbox is drawn just right of its top-right
    corner; with ``draw_bbox=False`` only the numbers are drawn.

    Args:
        i: Page index into ``bbox_list``.
        bbox_list: Per-page lists of [x0, y0, x1, y1] boxes in top-left-origin coords.
        page: pypdf page object; only ``cropbox`` is read for the page height.
        c: reportlab canvas to draw on.
        rgb_config: Color as 0-255 ``[r, g, b]``.
        fill_config: Truthy → translucent filled rectangles; falsy → outlines.
        draw_bbox: Whether to draw the rectangle itself in addition to the number.

    Returns:
        The same canvas ``c`` (drawn on in place).
    """
    new_rgb = [float(color) / 255 for color in rgb_config]
    page_data = bbox_list[i]
    # Force float: pypdf cropbox entries can be Decimal-like NumberObjects.
    page_height = float(page.cropbox[3])
    for j, bbox in enumerate(page_data):
        # Make sure every bbox coordinate is a float as well.
        x0, y0, x1, y1 = map(float, bbox)
        width = x1 - x0
        height = y1 - y0
        # Flip y: the PDF canvas origin is bottom-left, bboxes are top-left based.
        rect = [x0, page_height - y1, width, height]
        if draw_bbox:
            if fill_config:
                c.setFillColorRGB(*new_rgb, 0.3)
                c.rect(rect[0], rect[1], rect[2], rect[3], stroke=0, fill=1)
            else:
                c.setStrokeColorRGB(*new_rgb)
                c.rect(rect[0], rect[1], rect[2], rect[3], stroke=1, fill=0)
        c.setFillColorRGB(*new_rgb, 1.0)
        c.setFontSize(size=10)
        # Label position also uses the flipped-y float coordinates.
        c.drawString(x1 + 2, page_height - y0 - 10, str(j + 1))
    return c
def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Write a debug copy of the PDF with every layout block drawn as a
    colored translucent rectangle and the reading order numbered per page.

    Args:
        pdf_info: per-page layout dicts (middle-JSON "pdf_info").
        pdf_bytes: raw bytes of the source PDF.
        out_path: directory the annotated PDF is written into.
        filename: name of the output file inside ``out_path``.
    """
    dropped_bbox_list = []
    tables_list, tables_body_list = [], []
    tables_caption_list, tables_footnote_list = [], []
    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
    imgs_footnote_list = []
    titles_list = []
    texts_list = []
    interequations_list = []
    lists_list = []
    indexs_list = []
    # Pass 1: collect, per page, one bbox list per block category.
    for page in pdf_info:
        page_dropped_list = []
        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
        imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
        titles = []
        texts = []
        interequations = []
        lists = []
        indices = []
        for dropped_bbox in page['discarded_blocks']:
            page_dropped_list.append(dropped_bbox['bbox'])
        dropped_bbox_list.append(page_dropped_list)
        for block in page["para_blocks"]:
            bbox = block["bbox"]
            if block["type"] == BlockType.TABLE:
                tables.append(bbox)
                for nested_block in block["blocks"]:
                    bbox = nested_block["bbox"]
                    if nested_block["type"] == BlockType.TABLE_BODY:
                        tables_body.append(bbox)
                    elif nested_block["type"] == BlockType.TABLE_CAPTION:
                        tables_caption.append(bbox)
                    elif nested_block["type"] == BlockType.TABLE_FOOTNOTE:
                        tables_footnote.append(bbox)
            elif block["type"] == BlockType.IMAGE:
                imgs.append(bbox)
                for nested_block in block["blocks"]:
                    bbox = nested_block["bbox"]
                    if nested_block["type"] == BlockType.IMAGE_BODY:
                        imgs_body.append(bbox)
                    elif nested_block["type"] == BlockType.IMAGE_CAPTION:
                        imgs_caption.append(bbox)
                    elif nested_block["type"] == BlockType.IMAGE_FOOTNOTE:
                        imgs_footnote.append(bbox)
            elif block["type"] == BlockType.TITLE:
                titles.append(bbox)
            elif block["type"] == BlockType.TEXT:
                texts.append(bbox)
            elif block["type"] == BlockType.INTERLINE_EQUATION:
                interequations.append(bbox)
            elif block["type"] == BlockType.LIST:
                lists.append(bbox)
            elif block["type"] == BlockType.INDEX:
                indices.append(bbox)
        tables_list.append(tables)
        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
        tables_footnote_list.append(tables_footnote)
        imgs_list.append(imgs)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
        imgs_footnote_list.append(imgs_footnote)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)
        lists_list.append(lists)
        indexs_list.append(indices)

    # Pass 2: flatten each page's blocks in reading order (table sub-blocks
    # ordered caption -> body -> footnote) for the numbered overlay.
    layout_bbox_list = []
    table_type_order = {"table_caption": 1, "table_body": 2, "table_footnote": 3}
    for page in pdf_info:
        page_block_list = []
        for block in page["para_blocks"]:
            if block["type"] in [
                BlockType.TEXT,
                BlockType.TITLE,
                BlockType.INTERLINE_EQUATION,
                BlockType.LIST,
                BlockType.INDEX,
            ]:
                bbox = block["bbox"]
                page_block_list.append(bbox)
            elif block["type"] in [BlockType.IMAGE]:
                for sub_block in block["blocks"]:
                    bbox = sub_block["bbox"]
                    page_block_list.append(bbox)
            elif block["type"] in [BlockType.TABLE]:
                sorted_blocks = sorted(block["blocks"], key=lambda x: table_type_order[x["type"]])
                for sub_block in sorted_blocks:
                    bbox = sub_block["bbox"]
                    page_block_list.append(bbox)
        layout_bbox_list.append(page_block_list)

    # Pass 3: draw one overlay per page and merge it onto the original page.
    pdf_bytes_io = BytesIO(pdf_bytes)
    pdf_docs = PdfReader(pdf_bytes_io)
    output_pdf = PdfWriter()
    for i, page in enumerate(pdf_docs.pages):
        # Build the overlay canvas with the original page's size.
        page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
        custom_page_size = (page_width, page_height)
        packet = BytesIO()
        c = canvas.Canvas(packet, pagesize=custom_page_size)
        c = draw_bbox_without_number(i, dropped_bbox_list, page, c, [158, 158, 158], True)
        c = draw_bbox_without_number(i, tables_body_list, page, c, [204, 204, 0], True)
        c = draw_bbox_without_number(i, tables_caption_list, page, c, [255, 255, 102], True)
        c = draw_bbox_without_number(i, tables_footnote_list, page, c, [229, 255, 204], True)
        c = draw_bbox_without_number(i, imgs_body_list, page, c, [153, 255, 51], True)
        c = draw_bbox_without_number(i, imgs_caption_list, page, c, [102, 178, 255], True)
        c = draw_bbox_without_number(i, imgs_footnote_list, page, c, [255, 178, 102], True)
        c = draw_bbox_without_number(i, titles_list, page, c, [102, 102, 255], True)
        c = draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True)
        c = draw_bbox_without_number(i, interequations_list, page, c, [0, 255, 0], True)
        c = draw_bbox_without_number(i, lists_list, page, c, [40, 169, 92], True)
        c = draw_bbox_without_number(i, indexs_list, page, c, [40, 169, 92], True)
        c = draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False, draw_bbox=False)
        c.save()
        packet.seek(0)
        overlay_pdf = PdfReader(packet)
        page.merge_page(overlay_pdf.pages[0])
        output_pdf.add_page(page)
    # Save the result. The ``filename`` parameter was previously ignored and
    # the output name hard-coded; honor the parameter instead.
    with open(f"{out_path}/{filename}", "wb") as f:
        output_pdf.write(f)
def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Write a debug copy of the PDF with every span's bbox outlined,
    color-coded by span type (text, equations, images, tables, dropped).

    Args:
        pdf_info: per-page layout dicts (middle-JSON "pdf_info").
        pdf_bytes: raw bytes of the source PDF.
        out_path: directory the annotated PDF is written into.
        filename: name of the output file inside ``out_path``.
    """
    text_list = []
    inline_equation_list = []
    interline_equation_list = []
    image_list = []
    table_list = []
    dropped_list = []
    next_page_text_list = []
    next_page_inline_equation_list = []

    def get_span_info(span):
        # Route one span's bbox into the current page's bucket; spans flagged
        # ``cross_page`` are deferred to the next page's bucket instead.
        if span['type'] == ContentType.TEXT:
            if span.get('cross_page', False):
                next_page_text_list.append(span['bbox'])
            else:
                page_text_list.append(span['bbox'])
        elif span['type'] == ContentType.INLINE_EQUATION:
            if span.get('cross_page', False):
                next_page_inline_equation_list.append(span['bbox'])
            else:
                page_inline_equation_list.append(span['bbox'])
        elif span['type'] == ContentType.INTERLINE_EQUATION:
            page_interline_equation_list.append(span['bbox'])
        elif span['type'] == ContentType.IMAGE:
            page_image_list.append(span['bbox'])
        elif span['type'] == ContentType.TABLE:
            page_table_list.append(span['bbox'])

    for page in pdf_info:
        page_text_list = []
        page_inline_equation_list = []
        page_interline_equation_list = []
        page_image_list = []
        page_table_list = []
        page_dropped_list = []
        # Spans carried over from the previous page land on this page first.
        if next_page_text_list:
            page_text_list.extend(next_page_text_list)
            next_page_text_list.clear()
        if next_page_inline_equation_list:
            page_inline_equation_list.extend(next_page_inline_equation_list)
            next_page_inline_equation_list.clear()
        # Collect dropped spans.
        for block in page['discarded_blocks']:
            if block['type'] == BlockType.DISCARDED:
                for line in block['lines']:
                    for span in line['spans']:
                        page_dropped_list.append(span['bbox'])
        dropped_list.append(page_dropped_list)
        # Collect the useful spans. Pre-merge blocks are sufficient here
        # because spans do not change during paragraph merging.
        for block in page['preproc_blocks']:
            if block['type'] in [
                BlockType.TEXT,
                BlockType.TITLE,
                BlockType.INTERLINE_EQUATION,
                BlockType.LIST,
                BlockType.INDEX,
            ]:
                for line in block['lines']:
                    for span in line['spans']:
                        get_span_info(span)
            elif block['type'] in [BlockType.IMAGE, BlockType.TABLE]:
                for sub_block in block['blocks']:
                    for line in sub_block['lines']:
                        for span in line['spans']:
                            get_span_info(span)
        text_list.append(page_text_list)
        inline_equation_list.append(page_inline_equation_list)
        interline_equation_list.append(page_interline_equation_list)
        image_list.append(page_image_list)
        table_list.append(page_table_list)

    # Draw one overlay per page and merge it onto the original page.
    pdf_bytes_io = BytesIO(pdf_bytes)
    pdf_docs = PdfReader(pdf_bytes_io)
    output_pdf = PdfWriter()
    for i, page in enumerate(pdf_docs.pages):
        # Build the overlay canvas with the original page's size.
        page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
        custom_page_size = (page_width, page_height)
        packet = BytesIO()
        c = canvas.Canvas(packet, pagesize=custom_page_size)
        draw_bbox_without_number(i, text_list, page, c, [255, 0, 0], False)
        draw_bbox_without_number(i, inline_equation_list, page, c, [0, 255, 0], False)
        draw_bbox_without_number(i, interline_equation_list, page, c, [0, 0, 255], False)
        draw_bbox_without_number(i, image_list, page, c, [255, 204, 0], False)
        draw_bbox_without_number(i, table_list, page, c, [204, 0, 255], False)
        draw_bbox_without_number(i, dropped_list, page, c, [158, 158, 158], False)
        c.save()
        packet.seek(0)
        overlay_pdf = PdfReader(packet)
        page.merge_page(overlay_pdf.pages[0])
        output_pdf.add_page(page)
    # Save the PDF. The ``filename`` parameter was previously ignored and the
    # output name hard-coded; honor the parameter instead.
    with open(f"{out_path}/{filename}", "wb") as f:
        output_pdf.write(f)
if __name__ == "__main__":
    # Read the source PDF file.
    pdf_path = "examples/demo1.pdf"
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()
    # Load pdf_info from the middle-JSON produced by the pipeline.
    json_path = "examples/demo1_1746005777.0863056_middle.json"
    with open(json_path, "r", encoding="utf-8") as f:
        pdf_ann = json.load(f)
    pdf_info = pdf_ann["pdf_info"]
    # Render the layout visualization into the examples directory.
    draw_layout_bbox(pdf_info, pdf_bytes, "examples", "output_with_layout.pdf")
class BlockType:
    """String constants naming the block categories used in middle-JSON."""

    # Compound blocks that contain nested sub-blocks
    IMAGE = 'image'
    TABLE = 'table'
    # Sub-blocks nested inside image/table blocks
    IMAGE_BODY = 'image_body'
    IMAGE_CAPTION = 'image_caption'
    IMAGE_FOOTNOTE = 'image_footnote'
    TABLE_BODY = 'table_body'
    TABLE_CAPTION = 'table_caption'
    TABLE_FOOTNOTE = 'table_footnote'
    # Flat text-like blocks
    TEXT = 'text'
    TITLE = 'title'
    INTERLINE_EQUATION = 'interline_equation'
    LIST = 'list'
    INDEX = 'index'
    # Blocks excluded from the main content flow
    DISCARDED = 'discarded'
class ContentType:
    """String constants naming span/content categories."""

    IMAGE = 'image'
    TABLE = 'table'
    TEXT = 'text'
    # Display equation on its own line vs. equation embedded in running text
    INTERLINE_EQUATION = 'interline_equation'
    INLINE_EQUATION = 'inline_equation'
class CategoryId:
    """Integer category ids used by the detection models."""

    # Layout detection categories
    Title = 0
    Text = 1
    Abandon = 2
    ImageBody = 3
    ImageCaption = 4
    TableBody = 5
    TableCaption = 6
    TableFootnote = 7
    InterlineEquation_Layout = 8
    # Formula detection categories
    InlineEquation = 13
    InterlineEquation_YOLO = 14
    # OCR text category
    OcrText = 15
    # Extended category
    ImageFootnote = 101
class MakeMode:
    """String constants selecting the output format to generate."""

    MM_MD = 'mm_markdown'
    NLP_MD = 'nlp_markdown'
    CONTENT_LIST = 'content_list'
class ModelPath:
    """Repository ids and in-repo relative paths for the models used here."""

    # VLM-backend model repositories (HuggingFace / ModelScope mirrors)
    vlm_root_hf = "opendatalab/MinerU2.0-2505-0.9B"
    vlm_root_modelscope = "OpenDataLab/MinerU2.0-2505-0.9B"
    # Pipeline-backend model repositories
    pipeline_root_hf = "opendatalab/PDF-Extract-Kit-1.0"
    pipeline_root_modelscope = "OpenDataLab/PDF-Extract-Kit-1.0"
    # Relative paths inside the pipeline repository
    doclayout_yolo = "models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt"
    yolo_v8_mfd = "models/MFD/YOLO/yolo_v8_ft.pt"
    unimernet_small = "models/MFR/unimernet_hf_small_2503"
    pytorch_paddle = "models/OCR/paddleocr_torch"
    layout_reader = "models/ReadingOrder/layout_reader"
    slanet_plus = "models/TabRec/SlanetPlus/slanet-plus.onnx"
class SplitFlag:
    """Marker keys attached to spans/blocks during page splitting."""

    CROSS_PAGE = 'cross_page'
    LINES_DELETED = 'lines_deleted'
\ No newline at end of file
import re
import itertools
import html
from typing import Any, Dict, List
from pydantic import (
BaseModel,
computed_field,
model_validator,
)
class TableCell(BaseModel):
    """A single table cell: its text, span sizes and grid offsets."""

    row_span: int = 1
    col_span: int = 1
    start_row_offset_idx: int
    end_row_offset_idx: int
    start_col_offset_idx: int
    end_col_offset_idx: int
    text: str
    column_header: bool = False
    row_header: bool = False
    row_section: bool = False

    @model_validator(mode="before")
    @classmethod
    def from_dict_format(cls, data: Any) -> Any:
        """Accept the docling-ibm-models dict shape.

        If ``text`` is absent, derive it from ``bbox.token`` or, failing
        that, by joining the tokens in ``text_cell_bboxes``.
        """
        if isinstance(data, Dict):
            # Already in native shape: nothing to convert.
            if "text" in data:
                return data
            text = data["bbox"].get("token", "")
            if not text:
                text_cells = data.pop("text_cell_bboxes", None)
                if text_cells:
                    text = " ".join(el["token"] for el in text_cells)
                text = text.strip()
            data["text"] = text
        return data
class TableData(BaseModel):  # TBD
    """Table payload: a flat cell list plus row/column counts."""

    table_cells: List[TableCell] = []
    num_rows: int = 0
    num_cols: int = 0

    @computed_field  # type: ignore
    @property
    def grid(
        self,
    ) -> List[List[TableCell]]:
        """Materialize the num_rows x num_cols grid (rebuilt on each access)."""
        # Start from a grid of fresh empty cells.
        cells = [
            [
                TableCell(
                    text="",
                    start_row_offset_idx=r,
                    end_row_offset_idx=r + 1,
                    start_col_offset_idx=col,
                    end_col_offset_idx=col + 1,
                )
                for col in range(self.num_cols)
            ]
            for r in range(self.num_rows)
        ]
        # Stamp each real cell over the grid area its spans cover,
        # clamped to the declared grid bounds.
        for cell in self.table_cells:
            row_lo = min(cell.start_row_offset_idx, self.num_rows)
            row_hi = min(cell.end_row_offset_idx, self.num_rows)
            col_lo = min(cell.start_col_offset_idx, self.num_cols)
            col_hi = min(cell.end_col_offset_idx, self.num_cols)
            for r in range(row_lo, row_hi):
                for col in range(col_lo, col_hi):
                    cells[r][col] = cell
        return cells
"""
OTSL
"""
# OTSL (table structure language) tokens.
OTSL_NL = "<nl>"      # row separator
OTSL_FCEL = "<fcel>"  # full cell (followed by its text in the mixed stream)
OTSL_ECEL = "<ecel>"  # empty cell
OTSL_LCEL = "<lcel>"  # continuation of a cell spanning from the left
OTSL_UCEL = "<ucel>"  # continuation of a cell spanning from above
OTSL_XCEL = "<xcel>"  # continuation of a 2D-spanning cell


def otsl_extract_tokens_and_text(s: str):
    """Split an OTSL string into structure tokens and the mixed token/text stream.

    Args:
        s: raw OTSL content.

    Returns:
        tokens: the OTSL structure tokens of ``s`` in order.
        text_parts: tokens AND the non-whitespace text between them, in order
            (the "mixed" stream that ``otsl_parse_texts`` expects).
    """
    # Alternation over the known OTSL tokens, captured so re.split keeps them.
    pattern = r"(" + r"|".join([OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]) + r")"
    tokens = re.findall(pattern, s)
    # Because the pattern is a capture group, re.split interleaves the tokens
    # with the in-between text; drop empty / whitespace-only fragments.
    text_parts = [part for part in re.split(pattern, s) if part.strip()]
    return tokens, text_parts
def otsl_parse_texts(texts, tokens):
    """Parse the mixed OTSL stream into TableCell objects.

    Args:
        texts: the mixed stream from otsl_extract_tokens_and_text — OTSL
            structure tokens interleaved with the text that follows each
            <fcel>.
        tokens: the structure tokens only, in order.

    Returns:
        (table_cells, split_row_tokens): the parsed TableCell list and the
        tokens grouped into rows (split on <nl>).
    """
    split_word = OTSL_NL
    # Group the token stream into rows, dropping the <nl> separators themselves.
    split_row_tokens = [
        list(y)
        for x, y in itertools.groupby(tokens, lambda z: z == split_word)
        if not x
    ]
    table_cells = []
    r_idx = 0  # current row index in split_row_tokens
    c_idx = 0  # current column index within the row

    def count_right(tokens, c_idx, r_idx, which_tokens):
        # Count consecutive tokens from (r_idx, c_idx) rightwards that belong
        # to which_tokens — measures the horizontal extent of a span.
        span = 0
        c_idx_iter = c_idx
        while tokens[r_idx][c_idx_iter] in which_tokens:
            c_idx_iter += 1
            span += 1
            # Bounds check after advancing, before the next subscript access.
            if c_idx_iter >= len(tokens[r_idx]):
                return span
        return span

    def count_down(tokens, c_idx, r_idx, which_tokens):
        # Count consecutive tokens from (r_idx, c_idx) downwards that belong
        # to which_tokens — measures the vertical extent of a span.
        span = 0
        r_idx_iter = r_idx
        while tokens[r_idx_iter][c_idx] in which_tokens:
            r_idx_iter += 1
            span += 1
            # Bounds check after advancing, before the next subscript access.
            if r_idx_iter >= len(tokens):
                return span
        return span

    for i, text in enumerate(texts):
        cell_text = ""
        if text in [
            OTSL_FCEL,
            OTSL_ECEL,
        ]:
            row_span = 1
            col_span = 1
            right_offset = 1
            if text != OTSL_ECEL:
                # A full cell is immediately followed by its text in the mixed
                # stream; take it and look one element further for the next token.
                cell_text = texts[i + 1]
                right_offset = 2
            # Check next element(s) for lcel / ucel / xcel and set
            # row_span / col_span accordingly.
            next_right_cell = ""
            if i + right_offset < len(texts):
                next_right_cell = texts[i + right_offset]
            next_bottom_cell = ""
            if r_idx + 1 < len(split_row_tokens):
                if c_idx < len(split_row_tokens[r_idx + 1]):
                    next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
            if next_right_cell in [
                OTSL_LCEL,
                OTSL_XCEL,
            ]:
                # Horizontally spanning cell (or 2D spanning cell).
                col_span += count_right(
                    split_row_tokens,
                    c_idx + 1,
                    r_idx,
                    [OTSL_LCEL, OTSL_XCEL],
                )
            if next_bottom_cell in [
                OTSL_UCEL,
                OTSL_XCEL,
            ]:
                # Vertically spanning cell (or 2D spanning cell).
                row_span += count_down(
                    split_row_tokens,
                    c_idx,
                    r_idx + 1,
                    [OTSL_UCEL, OTSL_XCEL],
                )
            table_cells.append(
                TableCell(
                    text=cell_text.strip(),
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=r_idx,
                    end_row_offset_idx=r_idx + row_span,
                    start_col_offset_idx=c_idx,
                    end_col_offset_idx=c_idx + col_span,
                )
            )
        # Advance the grid cursor: every cell token moves one column right...
        if text in [
            OTSL_FCEL,
            OTSL_ECEL,
            OTSL_LCEL,
            OTSL_UCEL,
            OTSL_XCEL,
        ]:
            c_idx += 1
        # ...and a row separator resets to the start of the next row.
        if text == OTSL_NL:
            r_idx += 1
            c_idx = 0
    return table_cells, split_row_tokens
def export_to_html(table_data: TableData):
    """Render ``table_data`` as an HTML ``<table>`` string ('' if no cells).

    Row/column spans are honored: a spanning cell is emitted only at its
    top-left grid position; the positions it covers are skipped.
    """
    if len(table_data.table_cells) == 0:
        return ""
    nrows = table_data.num_rows
    ncols = table_data.num_cols
    # ``grid`` is a computed property that rebuilds the whole grid on every
    # access, so hoist it out of the O(rows*cols) loops.
    grid = table_data.grid
    body = ""
    for i in range(nrows):
        body += "<tr>"
        for j in range(ncols):
            cell: TableCell = grid[i][j]
            # Emit a spanning cell only at its top-left origin.
            if cell.start_row_offset_idx != i or cell.start_col_offset_idx != j:
                continue
            content = html.escape(cell.text.strip())
            celltag = "th" if cell.column_header else "td"
            opening_tag = celltag
            if cell.row_span > 1:
                opening_tag += f' rowspan="{cell.row_span}"'
            if cell.col_span > 1:
                opening_tag += f' colspan="{cell.col_span}"'
            body += f"<{opening_tag}>{content}</{celltag}>"
        body += "</tr>"
    return f"<table>{body}</table>"
def convert_otsl_to_html(otsl_content: str):
    """Convert an OTSL table string into an HTML ``<table>`` string."""
    structure_tokens, mixed_stream = otsl_extract_tokens_and_text(otsl_content)
    cells, rows_of_tokens = otsl_parse_texts(mixed_stream, structure_tokens)
    # The column count is the widest row; 0 when the table has no rows at all.
    num_cols = max(len(row) for row in rows_of_tokens) if rows_of_tokens else 0
    payload = TableData(
        num_rows=len(rows_of_tokens),
        num_cols=num_cols,
        table_cells=cells,
    )
    return export_to_html(payload)
# Copyright (c) Opendatalab. All rights reserved.
import hashlib
import json
def compute_md5(file_bytes):
    """Return the uppercase hex MD5 digest of ``file_bytes``.

    A bad merge left two stacked ``def`` headers here (no body under the
    first); restored as one working function, with both names kept below
    for backward compatibility.
    """
    hasher = hashlib.md5()
    hasher.update(file_bytes)
    return hasher.hexdigest().upper()


# Newer call sites use this name; keep both bound to the same function.
bytes_md5 = compute_md5
def compute_sha256(input_string):
    """Return the hex SHA-256 digest of ``input_string`` (UTF-8 encoded).

    A bad merge nested the helper defs under headerless functions here;
    restored as three flat, working functions.
    """
    hasher = hashlib.sha256()
    # Python 3 hash functions operate on bytes, so encode the string first.
    hasher.update(input_string.encode('utf-8'))
    return hasher.hexdigest()


def str_md5(input_string):
    """Return the hex MD5 digest of ``input_string`` (UTF-8 encoded)."""
    hasher = hashlib.md5()
    hasher.update(input_string.encode('utf-8'))
    return hasher.hexdigest()


def str_sha256(input_string):
    """Return the hex SHA-256 digest of ``input_string`` (UTF-8 encoded)."""
    hasher = hashlib.sha256()
    hasher.update(input_string.encode('utf-8'))
    return hasher.hexdigest()
def dict_md5(d):
    """Hex MD5 of the canonical JSON form of ``d`` (keys sorted, non-ASCII kept)."""
    canonical = json.dumps(d, sort_keys=True, ensure_ascii=False)
    digest = hashlib.md5(canonical.encode('utf-8'))
    return digest.hexdigest()
\ No newline at end of file
# Copyright (c) Opendatalab. All rights reserved.
import json
from loguru import logger
from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
from openai import OpenAI
import ast
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import merge_para_with_text
#@todo: 有的公式以"\"结尾,这样会导致尾部拼接的"$"被转义,也需要修复
formula_optimize_prompt = """请根据以下指南修正LaTeX公式的错误,确保公式能够渲染且符合原始内容:
1. 修正渲染或编译错误:
- Some syntax errors such as mismatched/missing/extra tokens. Your task is to fix these syntax errors and make sure corrected results conform to latex math syntax principles.
- 包含KaTeX不支持的关键词等原因导致的无法编译或渲染的错误
2. 保留原始信息:
- 保留原始公式中的所有重要信息
- 不要添加任何原始公式中没有的新信息
IMPORTANT:请仅返回修正后的公式,不要包含任何介绍、解释或元数据。
LaTeX recognition result:
$FORMULA
Your corrected result:
"""
text_optimize_prompt = f"""请根据以下指南修正OCR引起的错误,确保文本连贯并符合原始内容:
1. 修正OCR引起的拼写错误和错误:
- 修正常见的OCR错误(例如,'rn' 被误读为 'm')
- 使用上下文和常识进行修正
- 只修正明显的错误,不要不必要的修改内容
- 不要添加额外的句号或其他不必要的标点符号
2. 保持原始结构:
- 保留所有标题和子标题
3. 保留原始内容:
- 保留原始文本中的所有重要信息
- 不要添加任何原始文本中没有的新信息
- 保留段落之间的换行符
4. 保持连贯性:
- 确保内容与前文顺畅连接
- 适当处理在句子中间开始或结束的文本
5. 修正行内公式:
- 去除行内公式前后多余的空格
- 修正公式中的OCR错误
- 确保公式能够通过KaTeX渲染
6. 修正全角字符
- 修正全角标点符号为半角标点符号
- 修正全角字母为半角字母
- 修正全角数字为半角数字
IMPORTANT:请仅返回修正后的文本,保留所有原始格式,包括换行符。不要包含任何介绍、解释或元数据。
Previous context:
Current chunk to process:
Corrected text:
"""
def llm_aided_formula(pdf_info_dict, formula_aided_config):
    """LLM-aided formula correction. Placeholder — not implemented yet."""
    pass
def llm_aided_text(pdf_info_dict, text_aided_config):
    """LLM-aided OCR text correction. Placeholder — not implemented yet."""
    pass
def llm_aided_title(pdf_info_dict, title_aided_config):
def llm_aided_title(page_info_list, title_aided_config):
client = OpenAI(
api_key=title_aided_config["api_key"],
base_url=title_aided_config["base_url"],
......@@ -78,8 +14,8 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
title_dict = {}
origin_title_list = []
i = 0
for page_num, page in pdf_info_dict.items():
blocks = page["para_blocks"]
for page_info in page_info_list:
blocks = page_info["para_blocks"]
for block in blocks:
if block["type"] == "title":
origin_title_list.append(block)
......@@ -92,7 +28,7 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
line_avg_height = sum(page_line_height_list) / len(page_line_height_list)
else:
line_avg_height = int(block['bbox'][3] - block['bbox'][1])
title_dict[f"{i}"] = [title_text, line_avg_height, int(page_num[5:])+1]
title_dict[f"{i}"] = [title_text, line_avg_height, int(page_info['page_idx']) + 1]
i += 1
# logger.info(f"Title list: {title_dict}")
......@@ -115,16 +51,21 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
- 标题从前至后的层级必须是连续的,不能跳过层级
- 标题层级最多为4级,不要添加过多的层级
- 优化后的标题只保留代表该标题的层级的整数,不要保留其他信息
5. 合理性检查与微调:
- 在完成初步分级后,仔细检查分级结果的合理性
- 根据上下文关系和逻辑顺序,对不合理的分级进行微调
- 确保最终的分级结果符合文档的实际结构和逻辑
- 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们
IMPORTANT:
请直接返回优化过的由标题层级组成的字典,格式为{{标题id:标题层级}},如下:
{{0:1,1:2,2:2,3:3}}
{{
0:1,
1:2,
2:2,
3:3
}}
不需要对字典格式化,不需要返回任何其他信息。
Input title list:
......@@ -145,16 +86,23 @@ Corrected title list:
{'role': 'user', 'content': title_optimize_prompt}],
temperature=0.7,
)
# logger.info(f"Title completion: {completion.choices[0].message.content}")
dict_completion = ast.literal_eval(completion.choices[0].message.content)
# logger.info(f"len(dict_completion): {len(dict_completion)}, len(title_dict): {len(title_dict)}")
content = completion.choices[0].message.content.strip()
# logger.info(f"Title completion: {content}")
if "</think>" in content:
idx = content.index("</think>") + len("</think>")
content = content[idx:].strip()
import json_repair
dict_completion = json_repair.loads(content)
dict_completion = {int(k): int(v) for k, v in dict_completion.items()}
# logger.info(f"len(dict_completion): {len(dict_completion)}, len(title_dict): {len(title_dict)}")
if len(dict_completion) == len(title_dict):
for i, origin_title_block in enumerate(origin_title_list):
origin_title_block["level"] = int(dict_completion[i])
break
else:
logger.warning("The number of titles in the optimized result is not equal to the number of titles in the input.")
logger.warning(
"The number of titles in the optimized result is not equal to the number of titles in the input.")
retry_count += 1
except Exception as e:
logger.exception(e)
......
import time
import torch
import gc
from PIL import Image
from loguru import logger
import numpy as np
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
from magic_pdf.libs.clean_memory import clean_memory
from mineru.utils.boxbase import get_minbox_if_overlap_by_ratio
def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
def crop_img(input_res, input_img, crop_paste_x=0, crop_paste_y=0):
crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
crop_xmax, crop_ymax = int(input_res['poly'][4]), int(input_res['poly'][5])
......@@ -16,15 +17,24 @@ def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
crop_new_width = crop_xmax - crop_xmin + crop_paste_x * 2
crop_new_height = crop_ymax - crop_ymin + crop_paste_y * 2
# Create a white background array
return_image = np.ones((crop_new_height, crop_new_width, 3), dtype=np.uint8) * 255
if isinstance(input_img, np.ndarray):
# Crop the original image using numpy slicing
cropped_img = input_np_img[crop_ymin:crop_ymax, crop_xmin:crop_xmax]
# Create a white background array
return_image = np.ones((crop_new_height, crop_new_width, 3), dtype=np.uint8) * 255
# Paste the cropped image onto the white background
return_image[crop_paste_y:crop_paste_y + (crop_ymax - crop_ymin),
crop_paste_x:crop_paste_x + (crop_xmax - crop_xmin)] = cropped_img
# Crop the original image using numpy slicing
cropped_img = input_img[crop_ymin:crop_ymax, crop_xmin:crop_xmax]
# Paste the cropped image onto the white background
return_image[crop_paste_y:crop_paste_y + (crop_ymax - crop_ymin),
crop_paste_x:crop_paste_x + (crop_xmax - crop_xmin)] = cropped_img
else:
# Create a white background array
return_image = Image.new('RGB', (crop_new_width, crop_new_height), 'white')
# Crop image
crop_box = (crop_xmin, crop_ymin, crop_xmax, crop_ymax)
cropped_img = input_img.crop(crop_box)
return_image.paste(cropped_img, (crop_paste_x, crop_paste_y))
return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width,
crop_new_height]
......@@ -287,6 +297,20 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
return ocr_res_list, filtered_table_res_list, single_page_mfdetrec_res
def clean_memory(device='cuda'):
    """Release cached accelerator memory for ``device``, then run the Python GC."""
    target = str(device)
    if device == 'cuda':
        # Only touch the CUDA caches when CUDA is actually usable.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
    elif target.startswith("npu"):
        # torch_npu is imported lazily so non-NPU hosts never need it.
        import torch_npu
        if torch_npu.npu.is_available():
            torch_npu.npu.empty_cache()
    elif target.startswith("mps"):
        torch.mps.empty_cache()
    gc.collect()
def clean_vram(device, vram_threshold=8):
total_memory = get_vram(device)
if total_memory and total_memory <= vram_threshold:
......
import os
from huggingface_hub import snapshot_download as hf_snapshot_download
from modelscope import snapshot_download as ms_snapshot_download
from mineru.utils.config_reader import get_local_models_dir
from mineru.utils.enum_class import ModelPath
def auto_download_and_get_model_root_path(relative_path: str, repo_mode='pipeline') -> str:
    """Resolve the local root path for a model file or directory, downloading on demand.

    - File input: join the returned root with ``relative_path`` to get the
      local file's absolute path.
    - Directory input: the download cache mirrors the repo structure, so the
      same join applies.

    Args:
        relative_path: file or directory path relative to the repo root.
        repo_mode: repository flavor, 'pipeline' or 'vlm'.

    Returns:
        Local root path (download cache dir, or the configured local root).

    Raises:
        ValueError: unknown ``repo_mode``, unknown model source, or missing
            local configuration.
    """
    model_source = os.getenv('MINERU_MODEL_SOURCE', "huggingface")

    # Fully local setup: no download, just return the configured root.
    if model_source == 'local':
        local_models_config = get_local_models_dir()
        root_path = local_models_config.get(repo_mode, None)
        if not root_path:
            raise ValueError(f"Local path for repo_mode '{repo_mode}' is not configured.")
        return root_path

    # Map each repo mode to its per-source repository id.
    repo_mapping = {
        'pipeline': {
            'huggingface': ModelPath.pipeline_root_hf,
            'modelscope': ModelPath.pipeline_root_modelscope,
            'default': ModelPath.pipeline_root_hf,
        },
        'vlm': {
            'huggingface': ModelPath.vlm_root_hf,
            'modelscope': ModelPath.vlm_root_modelscope,
            'default': ModelPath.vlm_root_hf,
        },
    }
    if repo_mode not in repo_mapping:
        raise ValueError(f"Unsupported repo_mode: {repo_mode}, must be 'pipeline' or 'vlm'")
    # Fall back to the default repo when the source has no explicit entry.
    repo = repo_mapping[repo_mode].get(model_source, repo_mapping[repo_mode]['default'])

    if model_source == "huggingface":
        snapshot_fn = hf_snapshot_download
    elif model_source == "modelscope":
        snapshot_fn = ms_snapshot_download
    else:
        raise ValueError(f"未知的仓库类型: {model_source}")

    # Restrict the snapshot to just the requested file/dir (and its subtree).
    pattern_root = relative_path.strip('/')
    return snapshot_fn(repo, allow_patterns=[pattern_root, pattern_root + "/*"])
if __name__ == '__main__':
    # Quick manual check: resolve one repo file and print its local path.
    path1 = "models/README.md"
    root = auto_download_and_get_model_root_path(path1)
    print("本地文件绝对路径:", os.path.join(root, path1))
\ No newline at end of file
# Copyright (c) Opendatalab. All rights reserved.
import copy
import cv2
import numpy as np
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
def merge_spans_to_line(spans, threshold=0.6):
    """Group spans into lines by vertical overlap.

    The spans are sorted by their top edge (y0) in place, then consecutive
    spans whose y-overlap ratio with the previous span exceeds ``threshold``
    are merged into the same line.

    Args:
        spans: list of dicts, each with a 'bbox' of [x0, y0, x1, y1].
        threshold: minimum overlap / min-height ratio to share a line.

    Returns:
        List of lines, each a list of span dicts ([] for empty input).
    """
    if not spans:
        return []
    # Sort by top edge so grouping becomes a single forward pass.
    spans.sort(key=lambda span: span['bbox'][1])
    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        # Same line when this span vertically overlaps the previous one enough.
        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], threshold):
            current_line.append(span)
        else:
            lines.append(current_line)
            current_line = [span]
    # current_line is never empty here, so the last line is always flushed.
    lines.append(current_line)
    return lines


def __is_overlaps_y_exceeds_threshold(bbox1,
                                      bbox2,
                                      overlap_ratio_threshold=0.8):
    """Return True when the y-overlap of the two bboxes exceeds
    ``overlap_ratio_threshold`` of the shorter bbox's height."""
    _, y0_1, _, y1_1 = bbox1
    _, y0_2, _, y1_2 = bbox2
    overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
    height1, height2 = y1_1 - y0_1, y1_2 - y0_2
    min_height = min(height1, height2)
    return (overlap / min_height) > overlap_ratio_threshold
def img_decode(content: bytes):
......@@ -212,10 +248,7 @@ def merge_det_boxes(dt_boxes):
angle_boxes_list.append(text_box)
continue
text_box_dict = {
'bbox': text_bbox,
'type': 'text',
}
text_box_dict = {'bbox': text_bbox}
dt_boxes_dict_list.append(text_box_dict)
# Merge adjacent text regions into lines
......
# Copyright (c) Opendatalab. All rights reserved.
import re
from io import BytesIO
import numpy as np
import pypdfium2 as pdfium
from loguru import logger
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
from pypdf import PdfReader
def classify(pdf_bytes):
    """
    Decide whether a PDF's text can be extracted directly or needs OCR.
    Args:
        pdf_bytes: raw bytes of the PDF file
    Returns:
        str: 'txt' if text can be extracted directly, 'ocr' if OCR is needed
    """
    try:
        # Work on a random sample of pages extracted from the byte data.
        sample_pdf_bytes = extract_pages(pdf_bytes)
        pdf = pdfium.PdfDocument(sample_pdf_bytes)
        # Page count of the sampled PDF.
        page_count = len(pdf)
        # An empty PDF goes straight to OCR.
        if page_count == 0:
            return 'ocr'
        # Check at most 10 pages.
        pages_to_check = min(page_count, 10)
        # Threshold: fewer than 50 clean characters per page on average → OCR.
        chars_threshold = 50
        # Too little text, or garbled (cid) text, means OCR is required.
        if (get_avg_cleaned_chars_per_page(pdf, pages_to_check) < chars_threshold) or detect_invalid_chars(sample_pdf_bytes):
            return 'ocr'
        else:
            # Pages dominated by images also need OCR.
            if get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check) >= 0.9:
                return 'ocr'
            return 'txt'
    except Exception as e:
        logger.error(f"判断PDF类型时出错: {e}")
        # Fail safe: fall back to OCR on any error.
        return 'ocr'
def get_avg_cleaned_chars_per_page(pdf_doc, pages_to_check):
    """Average number of non-whitespace characters per page.

    Reads the first ``pages_to_check`` pages of ``pdf_doc`` (a pdfium
    document) and closes the document before returning.

    Args:
        pdf_doc: open pdfium.PdfDocument (indexable; closed on exit).
        pages_to_check: number of leading pages to sample (must be >= 1).

    Returns:
        float: average count of non-whitespace characters per sampled page.
    """
    cleaned_total_chars = 0
    for i in range(pages_to_check):
        page = pdf_doc[i]
        text = page.get_textpage().get_text_bounded()
        # Count only non-whitespace characters.
        cleaned_total_chars += len(re.sub(r'\s+', '', text))
    avg_cleaned_chars_per_page = cleaned_total_chars / pages_to_check
    pdf_doc.close()  # release the pdfium document
    return avg_cleaned_chars_per_page
def get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check):
    """Fraction of the first ``pages_to_check`` pages whose estimated image
    coverage is total (>= 1.0 of the page area).

    Coverage is a rough estimate: image XObject pixel dimensions are summed
    without applying transformation matrices, then divided by the page area.
    """
    pdf_stream = BytesIO(sample_pdf_bytes)
    pdf_reader = PdfReader(pdf_stream)
    # Number of pages classified as high-image-coverage.
    high_image_coverage_pages = 0
    # Inspect the leading pages only.
    for i in range(pages_to_check):
        page = pdf_reader.pages[i]
        # Page dimensions from the media box.
        page_width = float(page.mediabox.width)
        page_height = float(page.mediabox.height)
        page_area = page_width * page_height
        # Accumulate an estimated image area.
        image_area = 0
        if '/Resources' in page:
            resources = page['/Resources']
            if '/XObject' in resources:
                x_objects = resources['/XObject']
                # Sum the area of all image XObjects on the page.
                for obj_name in x_objects:
                    try:
                        obj = x_objects[obj_name]
                        if obj['/Subtype'] == '/Image':
                            # Image pixel dimensions.
                            width = obj.get('/Width', 0)
                            height = obj.get('/Height', 0)
                            # Rough area estimate: no transform matrix applied,
                            # so pixel size stands in for rendered size.
                            scale_factor = 1.0  # assumed scale
                            img_area = width * height * scale_factor
                            image_area += img_area
                    except Exception as e:
                        # Skip malformed XObjects silently.
                        continue
        # Estimated coverage, clamped to [0, 1].
        estimated_coverage = min(image_area / page_area, 1.0) if page_area > 0 else 0
        # NOTE(review): the original comment claimed an 80% threshold, but the
        # code requires full coverage (>= 1) — confirm which is intended.
        if estimated_coverage >= 1:
            high_image_coverage_pages += 1
    # Ratio of high-coverage pages among those checked.
    high_image_coverage_ratio = high_image_coverage_pages / pages_to_check
    pdf_stream.close()  # release the byte stream
    pdf_reader.close()
    return high_image_coverage_ratio
def extract_pages(src_pdf_bytes: bytes) -> bytes:
    """Build a new PDF from up to 10 randomly chosen pages of the input.

    Args:
        src_pdf_bytes: raw bytes of the source PDF.

    Returns:
        Bytes of the sampled PDF, or b'' when the source is empty or the
        sampling fails.
    """
    pdf = pdfium.PdfDocument(src_pdf_bytes)
    total_page = len(pdf)
    if total_page == 0:
        # Nothing to sample from.
        logger.warning("PDF is empty, return empty document")
        return b''
    # Cap the sample at 10 pages.
    select_page_cnt = min(10, total_page)
    # Random page indices, without replacement.
    chosen_indices = np.random.choice(total_page, select_page_cnt, replace=False).tolist()
    # Assemble the sampled pages into a fresh document.
    sample_docs = pdfium.PdfDocument.new()
    try:
        sample_docs.import_pages(pdf, chosen_indices)
        out_buffer = BytesIO()
        sample_docs.save(out_buffer)
        return out_buffer.getvalue()
    except Exception as e:
        logger.exception(e)
        return b''  # empty bytes on failure
def detect_invalid_chars(sample_pdf_bytes: bytes) -> bool:
    """Detect whether a PDF sample contains garbled characters.

    pdfminer is slow, so callers are expected to pass a ~10 page random sample
    (see extract_pages) rather than the whole document.

    Args:
        sample_pdf_bytes: Bytes of the sampled PDF.

    Returns:
        bool: True when the document looks garbled, False otherwise.
    """
    # Fix: the original opened the docstring with four quotes ("""") — a typo.
    sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
    laparams = LAParams(
        line_overlap=0.5,
        char_margin=2.0,
        line_margin=0.5,
        word_margin=0.1,
        boxes_flow=None,
        detect_vertical=False,
        all_texts=False,
    )
    text = extract_text(pdf_file=sample_pdf_file_like_object, laparams=laparams)
    text = text.replace("\n", "")
    # Garbled glyphs come out of pdfminer as "(cid:xxx)" placeholders.
    cid_pattern = re.compile(r'\(cid:\d+\)')
    matches = cid_pattern.findall(text)
    cid_count = len(matches)
    cid_len = sum(len(match) for match in matches)
    text_len = len(text)
    if text_len == 0:
        cid_chars_ratio = 0
    else:
        # Each "(cid:xxx)" placeholder stands for exactly one source character.
        cid_chars_ratio = cid_count / (cid_count + text_len - cid_len)
    # A document with more than 5% cid placeholders is considered garbled.
    return cid_chars_ratio > 0.05
if __name__ == '__main__':
    # Manual smoke test against a developer-local sample PDF.
    with open('/Users/myhloli/pdf/luanma2x10.pdf', 'rb') as f:
        p_bytes = f.read()
    logger.info(f"PDF分类结果: {classify(p_bytes)}")
\ No newline at end of file
# Copyright (c) Opendatalab. All rights reserved.
from io import BytesIO
import pypdfium2 as pdfium
from loguru import logger
from PIL import Image
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
from .hash_utils import str_sha256
def pdf_page_to_image(page: pdfium.PdfPage, dpi=200) -> dict:
    """Render a single pdfium page to a PIL image plus its base64 encoding.

    Args:
        page: The pdfium.PdfPage to render.
        dpi: Target rendering resolution. Defaults to 200.

    Returns:
        dict: {'img_base64': str, 'img_pil': PIL image, 'scale': float}
    """
    pil_img, scale = page_to_image(page, dpi=dpi)
    return {
        "img_base64": image_to_b64str(pil_img),
        "img_pil": pil_img,
        "scale": scale,
    }
def load_images_from_pdf(
    pdf_bytes: bytes,
    dpi=200,
    start_page_id=0,
    end_page_id=None,
):
    """Render a page range of a PDF (given as bytes) into image dicts.

    Returns:
        tuple: (list of image dicts from pdf_page_to_image, the open PdfDocument)
    """
    pdf_doc = pdfium.PdfDocument(pdf_bytes)
    page_count = len(pdf_doc)
    # Normalize the end page: None / negative means "through the last page".
    if end_page_id is None or end_page_id < 0:
        end_page_id = page_count - 1
    elif end_page_id > page_count - 1:
        logger.warning("end_page_id is out of range, use images length")
        end_page_id = page_count - 1
    images_list = [
        pdf_page_to_image(pdf_doc[idx], dpi=dpi)
        for idx in range(page_count)
        if start_page_id <= idx <= end_page_id
    ]
    return images_list, pdf_doc
def cut_image(bbox: tuple, page_num: int, page_pil_img, return_path, image_writer: FileBasedDataWriter, scale=2):
    """Crop the bbox region of page *page_num* and persist it as a jpg.

    The logical name is "{page_num}_{x0}_{y0}_{x1}_{y1}" with bbox values
    truncated to ints; the file itself is written under a flattened
    sha256-derived path, which is returned.
    """
    # Integer bbox coordinates make a stable, readable logical name.
    filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}"
    # Legacy-style logical path (no bucket prefix).
    # Fix: the original left `filename` unused and embedded a garbled
    # "(unknown)" literal in the path.
    img_path = f"{return_path}_{filename}" if return_path is not None else None
    # New layout: flat sha256-based file name.
    # NOTE(review): str_sha256(img_path) with img_path=None looks suspicious —
    # confirm callers always pass a non-None return_path.
    img_hash256_path = f"{str_sha256(img_path)}.jpg"
    crop_img = get_crop_img(bbox, page_pil_img, scale=scale)
    img_bytes = image_to_bytes(crop_img, image_format="JPEG")
    image_writer.write(img_hash256_path, img_bytes)
    return img_hash256_path
def get_crop_img(bbox: tuple, pil_img, scale=2):
    """Crop a region from *pil_img*, scaling the PDF-space bbox into pixel space."""
    left, top, right, bottom = (int(coord * scale) for coord in bbox)
    return pil_img.crop((left, top, right, bottom))
def images_bytes_to_pdf_bytes(image_bytes):
    """Convert a single image (raw bytes) into a one-page PDF, returned as bytes."""
    # Decode and normalize to RGB, which the PDF encoder requires.
    rgb_image = Image.open(BytesIO(image_bytes)).convert("RGB")
    with BytesIO() as pdf_buffer:
        rgb_image.save(pdf_buffer, format="PDF", save_all=True)
        return pdf_buffer.getvalue()
# Copyright (c) Opendatalab. All rights reserved.
import base64
from io import BytesIO
from loguru import logger
from PIL import Image
from pypdfium2 import PdfBitmap, PdfDocument, PdfPage
def page_to_image(
    page: PdfPage,
    dpi: int = 144,  # changed from 200 to 144
    max_width_or_height: int = 2560,  # changed from 4500 to 2560
) -> tuple[Image.Image, float]:
    """Render a pdfium page to a PIL image.

    The render scale is derived from *dpi* (72 dpi == 1.0). When the page's
    longer side exceeds *max_width_or_height*, the scale is replaced so that
    side maps to at most that many pixels.
    NOTE(review): the cap compares the page's unscaled size in points, not
    the rendered pixel size — confirm this is intentional.

    Returns:
        tuple[Image.Image, float]: the rendered image and the scale used.
    """
    scale = dpi / 72
    long_side_length = max(*page.get_size())
    if long_side_length > max_width_or_height:
        scale = max_width_or_height / long_side_length
    bitmap: PdfBitmap = page.render(scale=scale)  # type: ignore
    try:
        image = bitmap.to_pil()
    finally:
        # Free the native bitmap even if the PIL conversion fails.
        try:
            bitmap.close()
        except Exception:
            pass
    return image, scale
def image_to_bytes(
    image: Image.Image,
    image_format: str = "PNG",  # "JPEG" also works
) -> bytes:
    """Serialize *image* into raw bytes using the given encoder format."""
    buffer = BytesIO()
    try:
        image.save(buffer, format=image_format)
        return buffer.getvalue()
    finally:
        buffer.close()
def image_to_b64str(
    image: Image.Image,
    image_format: str = "PNG",  # "JPEG" also works
) -> str:
    """Encode *image* as a base64 text string in the given format."""
    raw = image_to_bytes(image, image_format)
    return base64.b64encode(raw).decode("utf-8")
def pdf_to_images(
    pdf: str | bytes | PdfDocument,
    dpi: int = 144,
    max_width_or_height: int = 2560,
    start_page_id: int = 0,
    end_page_id: int | None = None,
) -> list[Image.Image]:
    """Render a page range of *pdf* to PIL images.

    Accepts a path, raw bytes, or an already-open PdfDocument. A document
    opened here is closed before returning; a caller-supplied PdfDocument is
    left open, since the caller owns it (the original closed it regardless,
    breaking callers that reuse their document).
    """
    owns_doc = not isinstance(pdf, PdfDocument)
    doc = PdfDocument(pdf) if owns_doc else pdf
    page_num = len(doc)
    end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else page_num - 1
    if end_page_id > page_num - 1:
        logger.warning("end_page_id is out of range, use images length")
        end_page_id = page_num - 1
    images = []
    try:
        for i in range(start_page_id, end_page_id + 1):
            image, _ = page_to_image(doc[i], dpi, max_width_or_height)
            images.append(image)
    finally:
        if owns_doc:
            # Close only what we opened ourselves.
            try:
                doc.close()
            except Exception:
                pass
    return images
def pdf_to_images_bytes(
    pdf: str | bytes | PdfDocument,
    dpi: int = 144,
    max_width_or_height: int = 2560,
    start_page_id: int = 0,
    end_page_id: int | None = None,
    image_format: str = "PNG",
) -> list[bytes]:
    """Render a page range of *pdf* and return each page as encoded image bytes."""
    rendered = pdf_to_images(pdf, dpi, max_width_or_height, start_page_id, end_page_id)
    return [image_to_bytes(page_image, image_format) for page_image in rendered]
def pdf_to_images_b64strs(
    pdf: str | bytes | PdfDocument,
    dpi: int = 144,
    max_width_or_height: int = 2560,
    start_page_id: int = 0,
    end_page_id: int | None = None,
    image_format: str = "PNG",
) -> list[str]:
    """Render a page range of *pdf* and return each page as a base64 string."""
    rendered = pdf_to_images(pdf, dpi, max_width_or_height, start_page_id, end_page_id)
    return [image_to_b64str(page_image, image_format) for page_image in rendered]
from typing import List
import math
import pypdfium2 as pdfium
from pdftext.pdf.chars import get_chars, deduplicate_chars
from pdftext.pdf.pages import get_spans, get_lines, assign_scripts, get_blocks
def get_page(
    page: pdfium.PdfPage,
    quote_loosebox: bool = True,
    superscript_height_threshold: float = 0.7,
    line_distance_threshold: float = 0.1,
) -> dict:
    """Extract structured text (blocks -> lines -> spans -> chars) from a pdfium page.

    Args:
        page: The pdfium page to read.
        quote_loosebox: Passed through to pdftext's char extraction.
        superscript_height_threshold: Height ratio used to detect super/subscripts.
        line_distance_threshold: Distance ratio used when grouping lines.

    Returns:
        dict: {'bbox', 'width', 'height', 'rotation', 'blocks'}
    """
    textpage = page.get_textpage()
    page_bbox: List[float] = page.get_bbox()
    page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
    page_height = math.ceil(abs(page_bbox[1] - page_bbox[3]))
    page_rotation = 0
    try:
        page_rotation = page.get_rotation()
    except Exception:
        # Fix: narrowed from a bare except. Rotation stays 0 when unavailable.
        pass
    chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
    spans = get_spans(chars, superscript_height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
    lines = get_lines(spans)
    assign_scripts(lines, height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
    blocks = get_blocks(lines)
    # Use a distinct name instead of shadowing the `page` parameter.
    page_dict = {
        "bbox": page_bbox,
        "width": page_width,
        "height": page_height,
        "rotation": page_rotation,
        "blocks": blocks
    }
    return page_dict
\ No newline at end of file
import asyncio
import threading
from queue import Queue
from typing import Any, AsyncIterable, Coroutine, Iterable, TypeVar
T = TypeVar("T")
def run_async(coroutine: Coroutine[Any, Any, T]) -> T:
    """Run *coroutine* to completion from synchronous code and return its result.

    Works both outside and inside a running event loop. The original called
    ``loop.run_until_complete`` on the *running* loop returned by
    ``asyncio.get_running_loop()``, which always raises
    "This event loop is already running"; instead, when a loop is already
    running in this thread, the coroutine is executed on a fresh loop in a
    dedicated worker thread.

    Raises:
        ValueError: when *coroutine* is not actually a coroutine object.
    """
    if not asyncio.iscoroutine(coroutine):
        raise ValueError("a coroutine was expected, got {!r}".format(coroutine))
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread: the simple path.
        return asyncio.run(coroutine)
    # A loop is already running here; run the coroutine on its own loop
    # in a worker thread and relay the outcome.
    outcome: dict = {}

    def _worker():
        try:
            outcome['value'] = asyncio.run(coroutine)
        except BaseException as exc:  # propagate everything to the caller
            outcome['error'] = exc

    worker = threading.Thread(target=_worker, daemon=True)
    worker.start()
    worker.join()
    if 'error' in outcome:
        raise outcome['error']
    return outcome['value']
def iter_async(iterable: AsyncIterable[T]) -> Iterable[T]:
    """Consume an async iterable from synchronous code, yielding its items.

    The async iteration runs on its own event loop in a daemon thread; items
    are handed over through a queue, and exceptions raised by the async side
    are re-raised in the consumer. Fixes two defects of the original: a bare
    ``None`` sentinel silently truncated streams that legitimately yield
    ``None``, and any Exception *instance* yielded by the iterable was
    wrongly re-raised. A unique sentinel plus (is_signal, payload) tuples
    disambiguate data from control messages.

    Raises:
        ValueError: when *iterable* is not an AsyncIterable.
    """
    if not isinstance(iterable, AsyncIterable):
        raise ValueError("an async iterable was expected, got {!r}".format(iterable))
    queue: Queue = Queue()
    done = object()  # unique end-of-stream marker

    async def _drain():
        try:
            async for item in iterable:
                queue.put((False, item))
        except Exception as exc:
            queue.put((True, exc))
        else:
            queue.put((True, done))

    def _run_in_thread():
        # A fresh thread has no running loop, so asyncio.run is always safe.
        asyncio.run(_drain())

    thread = threading.Thread(target=_run_in_thread, daemon=True)
    thread.start()
    while True:
        is_signal, payload = queue.get()
        if is_signal:
            if payload is done:
                break
            raise payload
        yield payload
    thread.join()
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, calculate_overlap_area_in_bbox1_area_ratio
# Copyright (c) Opendatalab. All rights reserved.
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
from mineru.utils.enum_class import BlockType, ContentType
from mineru.utils.ocr_utils import __is_overlaps_y_exceeds_threshold
# Sort the spans of each line from left to right
def line_sort_spans_by_left_to_right(lines):
    """Sort each line's spans by x0 and wrap lines as {'bbox', 'spans'} dicts.

    The line bbox is the union of its spans' bboxes. This reconstructs the
    function, which was truncated by a bad merge (the list literal was never
    closed and the append/return lines were stranded further down the file).
    """
    line_objects = []
    for line in lines:
        # Reading order: sort spans by their left edge.
        line.sort(key=lambda span: span['bbox'][0])
        line_bbox = [
            min(span['bbox'][0] for span in line),  # x0
            min(span['bbox'][1] for span in line),  # y0
            max(span['bbox'][2] for span in line),  # x1
            max(span['bbox'][3] for span in line),  # y1
        ]
        line_objects.append({
            'bbox': line_bbox,
            'spans': line,
        })
    return line_objects
def fill_spans_in_blocks(blocks, spans, radio):
    """Assign spans to the blocks they overlap, consuming them from *spans*.

    A span joins a block when its overlap ratio exceeds *radio* and its type
    is compatible with the block type. Matched spans are removed from the
    input list so later blocks cannot claim them again.
    """
    block_with_spans = []
    for block in blocks:
        btype = block[7]
        bbox = block[0:4]
        entry = {
            'type': btype,
            'bbox': bbox,
        }
        # Grouped image/table parts carry their group id in the last slot.
        if btype in [
            BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE,
            BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE
        ]:
            entry['group_id'] = block[-1]
        matched = [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], bbox) > radio
            and span_block_type_compatible(span['type'], btype)
        ]
        entry['spans'] = matched
        block_with_spans.append(entry)
        # Consume the matched spans from the shared pool.
        for span in matched:
            spans.remove(span)
    return block_with_spans, spans
def span_block_type_compatible(span_type, block_type):
    """Return True when a span of *span_type* may live inside a block of *block_type*.

    Reconstructed: the original had five foreign lines from another function
    interleaved into its body by a bad merge.
    """
    if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
        # Plain text and inline equations fit any text-like block.
        return block_type in [
            BlockType.TEXT,
            BlockType.TITLE,
            BlockType.IMAGE_CAPTION,
            BlockType.IMAGE_FOOTNOTE,
            BlockType.TABLE_CAPTION,
            BlockType.TABLE_FOOTNOTE,
            BlockType.DISCARDED
        ]
    elif span_type == ContentType.INTERLINE_EQUATION:
        return block_type in [BlockType.INTERLINE_EQUATION, BlockType.TEXT]
    elif span_type == ContentType.IMAGE:
        return block_type in [BlockType.IMAGE_BODY]
    elif span_type == ContentType.TABLE:
        return block_type in [BlockType.TABLE_BODY]
    else:
        return False
def fix_discarded_block(discarded_block_with_spans):
    """Normalize every discarded block from span form into line-structured form."""
    return [fix_text_block(block) for block in discarded_block_with_spans]
def fix_text_block(block):
    """Convert a span-based text block into a line-based one.

    Interline equations inside a text block are demoted to inline type, then
    the spans are merged into lines and sorted left-to-right.
    """
    for span in block['spans']:
        # Equations embedded in running text must render inline.
        if span['type'] == ContentType.INTERLINE_EQUATION:
            span['type'] = ContentType.INLINE_EQUATION
    merged_lines = merge_spans_to_line(block['spans'])
    block['lines'] = line_sort_spans_by_left_to_right(merged_lines)
    del block['spans']
    return block
def merge_spans_to_line(spans, threshold=0.6):
......@@ -34,11 +91,11 @@ def merge_spans_to_line(spans, threshold=0.6):
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# image和table类型,同上
if span['type'] in [
ContentType.InterlineEquation, ContentType.Image,
ContentType.Table
ContentType.INTERLINE_EQUATION, ContentType.IMAGE,
ContentType.TABLE
] or any(s['type'] in [
ContentType.InterlineEquation, ContentType.Image,
ContentType.Table
ContentType.INTERLINE_EQUATION, ContentType.IMAGE,
ContentType.TABLE
] for s in current_line):
# 则开始新行
lines.append(current_line)
......@@ -60,70 +117,36 @@ def merge_spans_to_line(spans, threshold=0.6):
return lines
def span_block_type_compatible(span_type, block_type):
if span_type in [ContentType.Text, ContentType.InlineEquation]:
return block_type in [
BlockType.Text,
BlockType.Title,
BlockType.ImageCaption,
BlockType.ImageFootnote,
BlockType.TableCaption,
BlockType.TableFootnote,
BlockType.Discarded
# 将每一个line中的span从左到右排序
def line_sort_spans_by_left_to_right(lines):
line_objects = []
for line in lines:
# 按照x0坐标排序
line.sort(key=lambda span: span['bbox'][0])
line_bbox = [
min(span['bbox'][0] for span in line), # x0
min(span['bbox'][1] for span in line), # y0
max(span['bbox'][2] for span in line), # x1
max(span['bbox'][3] for span in line), # y1
]
elif span_type == ContentType.InterlineEquation:
return block_type in [BlockType.InterlineEquation, BlockType.Text]
elif span_type == ContentType.Image:
return block_type in [BlockType.ImageBody]
elif span_type == ContentType.Table:
return block_type in [BlockType.TableBody]
else:
return False
def fill_spans_in_blocks(blocks, spans, radio):
"""将allspans中的span按位置关系,放入blocks中."""
block_with_spans = []
for block in blocks:
block_type = block[7]
block_bbox = block[0:4]
block_dict = {
'type': block_type,
'bbox': block_bbox,
}
if block_type in [
BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
]:
block_dict['group_id'] = block[-1]
block_spans = []
for span in spans:
span_bbox = span['bbox']
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio and span_block_type_compatible(span['type'], block_type):
block_spans.append(span)
block_dict['spans'] = block_spans
block_with_spans.append(block_dict)
# 从spans删除已经放入block_spans中的span
if len(block_spans) > 0:
for span in block_spans:
spans.remove(span)
return block_with_spans, spans
line_objects.append({
'bbox': line_bbox,
'spans': line,
})
return line_objects
def fix_block_spans_v2(block_with_spans):
def fix_block_spans(block_with_spans):
fix_blocks = []
for block in block_with_spans:
block_type = block['type']
if block_type in [BlockType.Text, BlockType.Title,
BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableCaption, BlockType.TableFootnote
if block_type in [BlockType.TEXT, BlockType.TITLE,
BlockType.IMAGE_CAPTION, BlockType.IMAGE_CAPTION,
BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE
]:
block = fix_text_block(block)
elif block_type in [BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody]:
elif block_type in [BlockType.INTERLINE_EQUATION, BlockType.IMAGE_BODY, BlockType.TABLE_BODY]:
block = fix_interline_block(block)
else:
continue
......@@ -131,29 +154,9 @@ def fix_block_spans_v2(block_with_spans):
return fix_blocks
def fix_discarded_block(discarded_block_with_spans):
fix_discarded_blocks = []
for block in discarded_block_with_spans:
block = fix_text_block(block)
fix_discarded_blocks.append(block)
return fix_discarded_blocks
def fix_text_block(block):
# 文本block中的公式span都应该转换成行内type
for span in block['spans']:
if span['type'] == ContentType.InterlineEquation:
span['type'] = ContentType.InlineEquation
block_lines = merge_spans_to_line(block['spans'])
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block['lines'] = sort_block_lines
del block['spans']
return block
def fix_interline_block(block):
    """Convert an interline-equation/image/table block from spans to sorted lines.

    Fix: removed a duplicated, unreachable ``return block`` left behind by a
    bad merge.
    """
    block_lines = merge_spans_to_line(block['spans'])
    sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
    block['lines'] = sort_block_lines
    del block['spans']
    return block
\ No newline at end of file
# Copyright (c) Opendatalab. All rights reserved.
import re
import statistics
import cv2
import numpy as np
from loguru import logger
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio, calculate_iou, \
get_minbox_if_overlap_by_ratio
from mineru.utils.enum_class import BlockType, ContentType
from mineru.utils.pdf_image_tools import get_crop_img
from mineru.utils.pdf_text_tool import get_page
def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
    """Keep only the spans that overlap a layout block of a compatible type.

    Spans overlapping a discarded block (>0.4) are always kept; image/table
    spans must overlap an image/table body block (>0.5); everything else must
    overlap some other block type (>0.5).
    """
    def get_block_bboxes(blocks, block_type_list):
        # Blocks are flat lists: [x0, y0, x1, y1, ..., type at index 7, ...]
        return [block[0:4] for block in blocks if block[7] in block_type_list]
    image_bboxes = get_block_bboxes(all_bboxes, [BlockType.IMAGE_BODY])
    table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TABLE_BODY])
    other_block_type = []
    # Collect every other declared block type. Fix: iterating
    # BlockType.__dict__.values() also picked up class-machinery strings
    # (__module__, __qualname__, __doc__); filter those out by attribute name.
    for attr_name, block_type in BlockType.__dict__.items():
        if attr_name.startswith('__') or not isinstance(block_type, str):
            continue
        if block_type not in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY]:
            other_block_type.append(block_type)
    other_block_bboxes = get_block_bboxes(all_bboxes, other_block_type)
    discarded_block_bboxes = get_block_bboxes(all_discarded_blocks, [BlockType.DISCARDED])
    new_spans = []
    for span in spans:
        span_bbox = span['bbox']
        span_type = span['type']
        if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.4 for block_bbox in
               discarded_block_bboxes):
            new_spans.append(span)
            continue
        if span_type == ContentType.IMAGE:
            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
                   image_bboxes):
                new_spans.append(span)
        elif span_type == ContentType.TABLE:
            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
                   table_bboxes):
                new_spans.append(span)
        else:
            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
                   other_block_bboxes):
                new_spans.append(span)
    return new_spans
def remove_overlaps_low_confidence_spans(spans):
    """Drop the lower-confidence span of every heavily-overlapping pair (IoU > 0.9).

    Mutates *spans* in place and returns (kept_spans, dropped_spans).
    NOTE: comparisons (`!=`, `in`, `remove`) use dict value-equality, so two
    identical-looking spans are treated as the same span.
    """
    dropped_spans = []
    # Among overlapping spans, remove the ones with the lower confidence score.
    for span1 in spans:
        for span2 in spans:
            if span1 != span2:
                # Neither span1 nor span2 may already be scheduled for dropping.
                if span1 in dropped_spans or span2 in dropped_spans:
                    continue
                else:
                    if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
                        if span1['score'] < span2['score']:
                            span_need_remove = span1
                        else:
                            span_need_remove = span2
                        if (
                            span_need_remove is not None
                            and span_need_remove not in dropped_spans
                        ):
                            dropped_spans.append(span_need_remove)
    if len(dropped_spans) > 0:
        for span_need_remove in dropped_spans:
            spans.remove(span_need_remove)
    return spans, dropped_spans
def remove_overlaps_min_spans(spans):
    """Among span pairs overlapping by more than 65%, drop the smaller span.

    Mutates *spans* in place and returns (kept_spans, dropped_spans).
    NOTE: membership and removal use dict value-equality.
    """
    dropped_spans = []
    # Remove the smaller of the two overlapping spans.
    for span1 in spans:
        for span2 in spans:
            if span1 != span2:
                # Neither span1 nor span2 may already be scheduled for dropping.
                if span1 in dropped_spans or span2 in dropped_spans:
                    continue
                else:
                    overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
                    if overlap_box is not None:
                        # Map the winning (smaller) bbox back to its span object.
                        span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
                        if span_need_remove is not None and span_need_remove not in dropped_spans:
                            dropped_spans.append(span_need_remove)
    if len(dropped_spans) > 0:
        for span_need_remove in dropped_spans:
            spans.remove(span_need_remove)
    return spans, dropped_spans
def __replace_ligatures(text: str):
    """Expand single-codepoint typographic ligatures (fi, fl, ...) into plain letters."""
    ligature_map = {
        'fi': 'fi', 'fl': 'fl', 'ff': 'ff', 'ffi': 'ffi', 'ffl': 'ffl', 'ſt': 'ft', 'st': 'st'
    }
    joined_pattern = '|'.join(map(re.escape, ligature_map))
    return re.sub(joined_pattern, lambda match: ligature_map[match.group()], text)
def __replace_unicode(text: str):
    """Remove CRLF pairs and map the stray U+0002 control char to a hyphen."""
    control_map = {
        '\r\n': '', '\u0002': '-',
    }
    joined_pattern = '|'.join(map(re.escape, control_map))
    return re.sub(joined_pattern, lambda match: control_map[match.group()], text)
"""pdf_text dict方案 char级别"""
def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded_blocks):
    """Fill text spans with native PDF characters, falling back to OCR crops.

    Horizontal spans are filled char-by-char from the PDF text layer; vertical
    spans are filled from whole pdf lines; spans that stay empty are cropped
    from the page image and marked for OCR. Mutates *spans* in place and
    returns it.
    """
    page_dict = get_page(pdf_page)
    page_all_chars = []
    page_all_lines = []
    for block in page_dict['blocks']:
        for line in block['lines']:
            if 0 < abs(line['rotation']) < 90:
                # Skip lines rotated strictly between 0 and 90 degrees.
                continue
            page_all_lines.append(line)
            for span in line['spans']:
                for char in span['chars']:
                    page_all_chars.append(char)
    # Median height over all text spans (drives the vertical-span test below).
    span_height_list = []
    for span in spans:
        if span['type'] in [ContentType.TEXT]:
            span_height = span['bbox'][3] - span['bbox'][1]
            span['height'] = span_height
            span['width'] = span['bbox'][2] - span['bbox'][0]
            span_height_list.append(span_height)
    if len(span_height_list) == 0:
        return spans
    else:
        median_span_height = statistics.median(span_height_list)
    useful_spans = []
    unuseful_spans = []
    # Two traits mark a vertical span: 1. its height covers several lines,
    # 2. its height/width ratio exceeds a threshold.
    vertical_spans = []
    for span in spans:
        if span['type'] in [ContentType.TEXT]:
            for block in all_bboxes + all_discarded_blocks:
                if block[7] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.INTERLINE_EQUATION]:
                    continue
                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
                    if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
                        vertical_spans.append(span)
                    elif block in all_bboxes:
                        useful_spans.append(span)
                    else:
                        unuseful_spans.append(span)
                    break
    # Vertical spans are filled directly from whole pdf lines.
    """垂直的span框直接用line进行填充"""
    if len(vertical_spans) > 0:
        for pdfium_line in page_all_lines:
            for span in vertical_spans:
                if calculate_overlap_area_in_bbox1_area_ratio(pdfium_line['bbox'].bbox, span['bbox']) > 0.5:
                    for pdfium_span in pdfium_line['spans']:
                        span['content'] += pdfium_span['text']
                    break
        for span in vertical_spans:
            if len(span['content']) == 0:
                spans.remove(span)
    # Horizontal spans are filled with chars first; still-empty ones go to OCR.
    """水平的span框先用char填充,再用ocr填充空的span框"""
    new_spans = []
    for span in useful_spans + unuseful_spans:
        if span['type'] in [ContentType.TEXT]:
            span['chars'] = []
            new_spans.append(span)
    need_ocr_spans = fill_char_in_spans(new_spans, page_all_chars)
    # OCR the spans that could not be filled from the text layer.
    """对未填充的span进行ocr"""
    if len(need_ocr_spans) > 0:
        for span in need_ocr_spans:
            # Crop the span's bbox from the page image, then hand it to OCR.
            span_pil_img = get_crop_img(span['bbox'], pil_img, scale)
            span_img = cv2.cvtColor(np.array(span_pil_img), cv2.COLOR_RGB2BGR)
            # Spans with contrast <= 0.17 are dropped instead of OCRed.
            if calculate_contrast(span_img, img_mode='bgr') <= 0.17:
                spans.remove(span)
                continue
            span['content'] = ''
            span['score'] = 1.0
            span['np_img'] = span_img
    return spans
def fill_char_in_spans(spans, all_chars):
    """Distribute PDF chars into the spans they geometrically belong to.

    Returns the spans that stayed (effectively) empty and thus need OCR.
    """
    # Roughly sort spans top-to-bottom so a char binds to the upper match first.
    spans = sorted(spans, key=lambda x: x['bbox'][1])
    for char in all_chars:
        for span in spans:
            if calculate_char_in_span(char['bbox'], span['bbox'], char['char']):
                span['chars'].append(char)
                break
    need_ocr_spans = []
    for span in spans:
        chars_to_content(span)
        # Some spans hold only one or two empty placeholders; use the ratio of
        # content length vs. span width/height to treat them as empty.
        if len(span['content']) * span['height'] < span['width'] * 0.5:
            # logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
            need_ocr_spans.append(span)
        del span['height'], span['width']
    return need_ocr_spans
# Characters that typically close a line; they get a relaxed right-edge match.
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
# Characters that typically open a line; they get a relaxed left-edge match.
LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
# Max allowed offset between char mid-line and span mid-line, as a fraction of span height.
Span_Height_Radio = 0.33


def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=Span_Height_Radio):
    """Decide whether a character box belongs to a span box.

    A char matches when its center lies inside the span and its vertical
    center is within ``span_height_radio`` of the span's vertical center.
    Line-stop / line-start punctuation gets a relaxed edge test so trailing
    and leading punctuation near the span border can still join it.

    Returns:
        bool: True when the char should be attributed to the span.
    """
    char_center_x = (char_bbox[0] + char_bbox[2]) / 2
    char_center_y = (char_bbox[1] + char_bbox[3]) / 2
    span_center_y = (span_bbox[1] + span_bbox[3]) / 2
    span_height = span_bbox[3] - span_bbox[1]
    if (
        span_bbox[0] < char_center_x < span_bbox[2]
        and span_bbox[1] < char_center_y < span_bbox[3]
        and abs(char_center_y - span_center_y) < span_height * span_height_radio
    ):
        return True
    if char in LINE_STOP_FLAG:
        # Trailing punctuation: accept when its left edge sits within one
        # span-height of the span's right border and the vertical test holds.
        if (
            (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
            and char_center_x > span_bbox[0]
            and span_bbox[1] < char_center_y < span_bbox[3]
            and abs(char_center_y - span_center_y) < span_height * span_height_radio
        ):
            return True
    elif char in LINE_START_FLAG:
        # Leading punctuation: the mirrored test on the span's left border.
        if (
            span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
            and char_center_x < span_bbox[2]
            and span_bbox[1] < char_center_y < span_bbox[3]
            and abs(char_center_y - span_center_y) < span_height * span_height_radio
        ):
            return True
    # Fix: the original fell through and implicitly returned None on several
    # paths; always return an explicit bool.
    return False
def chars_to_content(span):
    """Assemble span['chars'] into span['content'], inserting inferred spaces.

    Leaves the span untouched when it has no chars (downstream treats that as
    "needs OCR"); otherwise replaces the 'chars' list with a cleaned
    'content' string.
    """
    if len(span['chars']) == 0:
        # No chars collected: leave the span as-is for the OCR fallback.
        pass
    else:
        # Restore text-stream order.
        span['chars'] = sorted(span['chars'], key=lambda x: x['char_idx'])
        # The median char width drives the missing-space heuristic below.
        char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']]
        median_width = statistics.median(char_widths)
        content = ''
        chars = span['chars']
        for idx, char in enumerate(chars):
            # Fix: the original found the neighbor via list.index(), which is
            # O(n^2) and picks the wrong neighbor when equal char dicts repeat.
            nxt = chars[idx + 1] if idx + 1 < len(chars) else None
            # A gap wider than a quarter char width suggests a missing space.
            if nxt and nxt['bbox'][0] - char['bbox'][2] > median_width * 0.25 and char['char'] != ' ' and nxt['char'] != ' ':
                content += f"{char['char']} "
            else:
                content += char['char']
        content = __replace_unicode(content)
        # Fix: the original applied __replace_ligatures twice; once suffices.
        content = __replace_ligatures(content)
        span['content'] = content.strip()
        del span['chars']
def calculate_contrast(img, img_mode) -> float:
    """Compute an image's contrast as std/mean of its grayscale pixel values.

    :param img: image as a numpy.ndarray
    :param img_mode: color channel order of *img*, 'rgb' or 'bgr'
    :return: contrast value, rounded to two decimals
    """
    conversions = {
        'rgb': cv2.COLOR_RGB2GRAY,
        'bgr': cv2.COLOR_BGR2GRAY,
    }
    if img_mode not in conversions:
        raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.")
    gray_img = cv2.cvtColor(img, conversions[img_mode])
    mean_value = np.mean(gray_img)
    std_dev = np.std(gray_img)
    # The small epsilon guards against division by zero on all-black images.
    contrast = std_dev / (mean_value + 1e-6)
    # logger.debug(f"contrast: {contrast}")
    return round(contrast, 2)
\ No newline at end of file
__version__ = "2.0.0"
\ No newline at end of file
......@@ -4,30 +4,22 @@ import base64
import os
import re
import time
import uuid
import zipfile
from pathlib import Path
import gradio as gr
import pymupdf
from gradio_pdf import PDF
from loguru import logger
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.libs.hash_utils import compute_sha256
from magic_pdf.tools.common import do_parse, prepare_env
from mineru.cli.common import prepare_env, do_parse, read_fn
from mineru.utils.hash_utils import str_sha256
def read_fn(path):
disk_rw = FileBasedDataReader(os.path.dirname(path))
return disk_rw.read(os.path.basename(path))
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_enable, language):
os.makedirs(output_dir, exist_ok=True)
try:
file_name = f'{str(Path(doc_path).stem)}_{time.time()}'
file_name = f'{str(Path(doc_path).stem)}_{time.strftime("%y%m%d_%H%M%S")}'
pdf_data = read_fn(doc_path)
if is_ocr:
parse_method = 'ocr'
......@@ -35,17 +27,14 @@ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_en
parse_method = 'auto'
local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
do_parse(
output_dir,
file_name,
pdf_data,
[],
parse_method,
False,
output_dir=output_dir,
pdf_file_names=[file_name],
pdf_bytes_list=[pdf_data],
p_lang_list=[language],
parse_method=parse_method,
end_page_id=end_page_id,
layout_model=layout_mode,
formula_enable=formula_enable,
table_enable=table_enable,
lang=language,
p_formula_enable=formula_enable,
p_table_enable=table_enable,
)
return local_md_dir, file_name
except Exception as e:
......@@ -96,12 +85,11 @@ def replace_image_with_base64(markdown_text, image_dir_path):
return re.sub(pattern, replace, markdown_text)
def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
def to_markdown(file_path, end_pages, is_ocr, formula_enable, table_enable, language):
file_path = to_pdf(file_path)
# 获取识别的md文件以及压缩包文件路径
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
layout_mode, formula_enable, table_enable, language)
archive_zip_path = os.path.join('./output', compute_sha256(local_md_dir) + '.zip')
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr, formula_enable, table_enable, language)
archive_zip_path = os.path.join('./output', str_sha256(local_md_dir) + '.zip')
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
if zip_archive_success == 0:
logger.info('压缩成功')
......@@ -125,24 +113,6 @@ latex_delimiters = [
]
def init_model():
from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
try:
model_manager = ModelSingleton()
txt_model = model_manager.get_model(False, False) # noqa: F841
logger.info('txt_model init final')
ocr_model = model_manager.get_model(True, False) # noqa: F841
logger.info('ocr_model init final')
return 0
except Exception as e:
logger.exception(e)
return -1
model_init = init_model()
logger.info(f'model_init: {model_init}')
with open('header.html', 'r') as file:
header = file.read()
......@@ -171,24 +141,30 @@ all_lang = []
all_lang.extend([*other_lang, *add_lang])
def safe_stem(file_path):
    """Sanitize a filename stem: keep word chars and dots, replace the rest with '_'."""
    return re.sub(r'[^\w.]', '_', Path(file_path).stem)
def to_pdf(file_path):
    """Normalize an uploaded file to a PDF on disk and return its path.

    The span as captured interleaved the old pymupdf-based implementation
    with the new read_fn-based one; this is the coherent post-merge version.

    Args:
        file_path: Path of the uploaded file, or None when nothing was uploaded.

    Returns:
        Path of the written PDF next to the input file, or None when
        file_path is None.
    """
    if file_path is None:
        return None
    # read_fn is defined elsewhere in this project; presumably it returns the
    # upload's content as PDF bytes (converting images/office docs) — confirm.
    pdf_bytes = read_fn(file_path)
    # Use a sanitized stem instead of a random uuid so the output name stays
    # recognizably related to the upload.
    # unique_filename = f'{uuid.uuid4()}.pdf'
    unique_filename = f'{safe_stem(file_path)}.pdf'
    # Write the converted bytes alongside the source file.
    tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
    with open(tmp_file_path, 'wb') as tmp_pdf_file:
        tmp_pdf_file.write(pdf_bytes)
    return tmp_file_path
if __name__ == '__main__':
......@@ -196,14 +172,16 @@ if __name__ == '__main__':
gr.HTML(header)
with gr.Row():
with gr.Column(variant='panel', scale=5):
file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
with gr.Row():
layout_mode = gr.Dropdown(['doclayout_yolo'], label='Layout model', value='doclayout_yolo')
language = gr.Dropdown(all_lang, label='Language', value='ch')
file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
with gr.Row(equal_height=True):
with gr.Column(scale=4):
max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
with gr.Column(scale=1):
language = gr.Dropdown(all_lang, label='Language', value='ch')
with gr.Row():
formula_enable = gr.Checkbox(label='Enable formula recognition', value=True)
is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
formula_enable = gr.Checkbox(label='Enable formula recognition', value=True)
table_enable = gr.Checkbox(label='Enable table recognition(test)', value=True)
with gr.Row():
change_bu = gr.Button('Convert')
......@@ -227,7 +205,7 @@ if __name__ == '__main__':
with gr.Tab('Markdown text'):
md_text = gr.TextArea(lines=45, show_copy_button=True)
file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
change_bu.click(fn=to_markdown, inputs=[file, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
change_bu.click(fn=to_markdown, inputs=[file, max_pages, is_ocr, formula_enable, table_enable, language],
outputs=[md, md_text, output_file, pdf_show])
clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
......
# ---- Build backend -------------------------------------------------------
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"
# ---- Core project metadata (PEP 621) -------------------------------------
[project]
name = "mineru"
# Version is resolved at build time from mineru.version.__version__
# (see [tool.setuptools.dynamic] below).
dynamic = ["version"]
license = {text = "AGPL-3.0"}
description = "A practical tool for converting PDF to Markdown"
readme = "README.md"
requires-python = ">=3.10,<3.14"
keywords = ["magic-pdf", "mineru", "MinerU", "convert", "pdf", "markdown"]
classifiers = [
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]
# Minimal runtime dependencies shared by every install flavor.
dependencies = [
    "boto3>=1.28.43",
    "click>=8.1.7",
    "loguru>=0.7.2",
    "numpy>=1.21.6",
    "pdfminer.six==20250506",
    "tqdm>=4.67.1",
    "requests",
    "httpx",
    "pillow>=11.0.0",
    "pypdfium2>=4.30.0",
    "pypdf>=5.6.0",
    "reportlab",
    "pdftext>=0.6.2",
    "modelscope>=1.26.0",
    "huggingface-hub>=0.32.4",
    "json-repair>=0.46.2",
]
# ---- Optional install flavors --------------------------------------------
[project.optional-dependencies]
# VLM-based parsing backend.
vlm = [
    "transformers>=4.51.1",
    "torch>=2.6.0",
    "accelerate>=1.5.1",
    "pydantic",
]
# sglang inference server support.
sglang = [
    "sglang[all]>=0.4.7",
]
# Classic layout/OCR pipeline backend.
pipeline = [
    "matplotlib>=3.10,<4",
    "ultralytics>=8.3.48,<9",
    "doclayout_yolo==0.0.4",
    "dill>=0.3.8,<1",
    "rapid_table>=1.0.5,<2.0.0",
    "PyYAML>=6.0.2,<7",
    "ftfy>=6.3.1,<7",
    "openai>=1.70.0,<2",
    "shapely>=2.0.7,<3",
    "pyclipper>=1.3.0,<2",
    "omegaconf>=2.3.0,<3",
    "torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
    "torchvision",
    "transformers>=4.49.0,!=4.51.0,<5.0.0",
    "fast-langdetect>=0.2.3,<0.3.0",
]
# Convenience meta-extras composed from the flavors above.
core = [
    "mineru[vlm]",
    "mineru[pipeline]",
]
all = [
    "mineru[core]",
    "mineru[sglang]",
]
# Tighter pins for older Linux distributions where newer wheels are unavailable.
pipeline_old_linux = [
    "matplotlib>=3.10,<=3.10.1",
    "ultralytics>=8.3.48,<=8.3.104",
    "doclayout_yolo==0.0.4",
    "dill==0.3.8",
    "PyYAML==6.0.2",
    "ftfy==6.3.1",
    "openai==1.71.0",
    "shapely==2.1.0",
    "pyclipper==1.3.0.post6",
    "omegaconf==2.3.0",
    "albumentations==1.4.20",
    "rapid_table==1.0.3",
    "torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
    "torchvision",
    "transformers>=4.49.0,!=4.51.0,<5.0.0",
    "fast-langdetect>=0.2.3,<0.3.0",
]
[project.urls]
Home = "https://mineru.net/"
Repository = "https://github.com/opendatalab/MinerU"
# ---- Console entry points -------------------------------------------------
[project.scripts]
mineru = "mineru.cli:client.main"
mineru-sglang-server = "mineru.cli.vlm_sglang_server:main"
mineru-models-download = "mineru.cli.models_download:download_models"
# ---- setuptools configuration ---------------------------------------------
[tool.setuptools.dynamic]
version = {attr = "mineru.version.__version__"}
[tool.setuptools.packages.find]
include = ["mineru*"]
namespaces = false
# Bundle non-Python resource files into the wheel.
[tool.setuptools.package-data]
"mineru" = ["resources/**"]
"mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils" = ["resources/**"]
[tool.setuptools]
include-package-data = true
zip-safe = false
# Core runtime requirements; keep in sync with pyproject.toml where overlapping.
boto3>=1.28.43
Brotli>=1.1.0
click>=8.1.7
fast-langdetect>=0.2.3,<0.3.0
loguru>=0.6.0
numpy>=1.21.6
pydantic>=2.7.2,<2.11
PyMuPDF>=1.24.9,<1.25.0
scikit-learn>=1.0.2
# torch 2.5.0/2.5.1 are excluded (known-bad releases for this project).
torch>=2.2.2,!=2.5.0,!=2.5.1,<3
torchvision
# transformers 4.51.0 is excluded (known-bad release for this project).
transformers>=4.49.0,!=4.51.0,<5.0.0
pdfminer.six==20250506
tqdm>=4.67.1
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment