Unverified Commit 0c7a0882 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #2611 from myhloli/dev

Dev
parents 3bd0ecf1 a392f445
import re
from typing import Literal
from mineru.utils.boxbase import bbox_distance, is_in
from mineru.utils.enum_class import ContentType, BlockType, SplitFlag
from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
from mineru.utils.format_utils import convert_otsl_to_html
class MagicModel:
def __init__(self, token: str, width, height):
self.token = token
# 使用正则表达式查找所有块
pattern = (
r"<\|box_start\|>(.*?)<\|box_end\|><\|ref_start\|>(.*?)<\|ref_end\|><\|md_start\|>(.*?)(?:<\|md_end\|>|<\|im_end\|>)"
)
block_infos = re.findall(pattern, token, re.DOTALL)
blocks = []
self.all_spans = []
# 解析每个块
for index, block_info in enumerate(block_infos):
block_bbox = block_info[0].strip()
x1, y1, x2, y2 = map(int, block_bbox.split())
x_1, y_1, x_2, y_2 = (
int(x1 * width / 1000),
int(y1 * height / 1000),
int(x2 * width / 1000),
int(y2 * height / 1000),
)
if x_2 < x_1:
x_1, x_2 = x_2, x_1
if y_2 < y_1:
y_1, y_2 = y_2, y_1
block_bbox = (x_1, y_1, x_2, y_2)
block_type = block_info[1].strip()
block_content = block_info[2].strip()
# print(f"坐标: {block_bbox}")
# print(f"类型: {block_type}")
# print(f"内容: {block_content}")
# print("-" * 50)
span_type = "unknown"
if block_type in [
"text",
"title",
"image_caption",
"image_footnote",
"table_caption",
"table_footnote",
"list",
"index",
]:
span_type = ContentType.TEXT
elif block_type in ["image"]:
block_type = BlockType.IMAGE_BODY
span_type = ContentType.IMAGE
elif block_type in ["table"]:
block_type = BlockType.TABLE_BODY
span_type = ContentType.TABLE
elif block_type in ["equation"]:
block_type = BlockType.INTERLINE_EQUATION
span_type = ContentType.INTERLINE_EQUATION
if span_type in ["image", "table"]:
span = {
"bbox": block_bbox,
"type": span_type,
}
if span_type == ContentType.TABLE:
if "<fcel>" in block_content or "<ecel>" in block_content:
lines = block_content.split("\n\n")
new_lines = []
for line in lines:
if "<fcel>" in line or "<ecel>" in line:
line = convert_otsl_to_html(line)
new_lines.append(line)
span["html"] = "\n\n".join(new_lines)
else:
span["html"] = block_content
elif span_type in [ContentType.INTERLINE_EQUATION]:
span = {
"bbox": block_bbox,
"type": span_type,
"content": isolated_formula_clean(block_content),
}
else:
if block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:
# 生成包含文本和公式的span列表
spans = []
last_end = 0
# 查找所有公式
for match in re.finditer(r'\\\((.+?)\\\)', block_content):
start, end = match.span()
# 添加公式前的文本
if start > last_end:
text_before = block_content[last_end:start]
if text_before.strip():
spans.append({
"bbox": block_bbox,
"type": ContentType.TEXT,
"content": text_before
})
# 添加公式(去除\(和\))
formula = match.group(1)
spans.append({
"bbox": block_bbox,
"type": ContentType.INLINE_EQUATION,
"content": formula.strip()
})
last_end = end
# 添加最后一个公式后的文本
if last_end < len(block_content):
text_after = block_content[last_end:]
if text_after.strip():
spans.append({
"bbox": block_bbox,
"type": ContentType.TEXT,
"content": text_after
})
span = spans
else:
span = {
"bbox": block_bbox,
"type": span_type,
"content": block_content,
}
if isinstance(span, dict) and "bbox" in span:
self.all_spans.append(span)
line = {
"bbox": block_bbox,
"spans": [span],
}
elif isinstance(span, list):
self.all_spans.extend(span)
line = {
"bbox": block_bbox,
"spans": span,
}
else:
raise ValueError(f"Invalid span type: {span_type}, expected dict or list, got {type(span)}")
blocks.append(
{
"bbox": block_bbox,
"type": block_type,
"lines": [line],
"index": index,
}
)
self.image_blocks = []
self.table_blocks = []
self.interline_equation_blocks = []
self.text_blocks = []
self.title_blocks = []
for block in blocks:
if block["type"] in [BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE]:
self.image_blocks.append(block)
elif block["type"] in [BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE]:
self.table_blocks.append(block)
elif block["type"] == BlockType.INTERLINE_EQUATION:
self.interline_equation_blocks.append(block)
elif block["type"] == BlockType.TEXT:
self.text_blocks.append(block)
elif block["type"] == BlockType.TITLE:
self.title_blocks.append(block)
else:
continue
def get_image_blocks(self):
return fix_two_layer_blocks(self.image_blocks, BlockType.IMAGE)
def get_table_blocks(self):
return fix_two_layer_blocks(self.table_blocks, BlockType.TABLE)
def get_title_blocks(self):
return fix_title_blocks(self.title_blocks)
def get_text_blocks(self):
return fix_text_blocks(self.text_blocks)
def get_interline_equation_blocks(self):
return self.interline_equation_blocks
def get_all_spans(self):
return self.all_spans
def isolated_formula_clean(txt):
latex = txt[:]
if latex.startswith("\\["): latex = latex[2:]
if latex.endswith("\\]"): latex = latex[:-2]
latex = latex_fix(latex.strip())
return latex
def latex_fix(latex):
# 白名单分隔符
valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
r'\{', r'\}', r'\lceil', r'\rceil', r'\lfloor',
r'\rfloor', r'\backslash', r'\uparrow', r'\downarrow',
r'\Uparrow', r'\Downarrow', r'\|', r'\.']
# 为\left后缺失有效分隔符的情况添加点
def fix_delim(match):
cmd = match.group(1) # \left 或 \right
rest = match.group(2) if len(match.groups()) > 1 else ""
if not rest or rest not in valid_delims_list:
return cmd + "."
return match.group(0)
LEFT_PATTERN = re.compile(r'(\\left)(\S*)')
RIGHT_PATTERN = re.compile(r'(\\right)(\S*)')
LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')
RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')
LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?')
latex = LEFT_PATTERN.sub(lambda m: fix_delim(m), latex)
latex = RIGHT_PATTERN.sub(lambda m: fix_delim(m), latex)
left_count = len(LEFT_COUNT_PATTERN.findall(latex)) # 不匹配\lefteqn等
right_count = len(RIGHT_COUNT_PATTERN.findall(latex)) # 不匹配\rightarrow
if left_count != right_count:
return LEFT_RIGHT_REMOVE_PATTERN.sub('', latex)
return latex
def __reduct_overlap(bboxes):
N = len(bboxes)
keep = [True] * N
for i in range(N):
for j in range(N):
if i == j:
continue
if is_in(bboxes[i]["bbox"], bboxes[j]["bbox"]):
keep[i] = False
return [bboxes[i] for i in range(N) if keep[i]]
def __tie_up_category_by_distance_v3(
blocks: list,
subject_block_type: str,
object_block_type: str,
):
subjects = __reduct_overlap(
list(
map(
lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"]},
filter(
lambda x: x["type"] == subject_block_type,
blocks,
),
)
)
)
objects = __reduct_overlap(
list(
map(
lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"]},
filter(
lambda x: x["type"] == object_block_type,
blocks,
),
)
)
)
ret = []
N, M = len(subjects), len(objects)
subjects.sort(key=lambda x: x["bbox"][0] ** 2 + x["bbox"][1] ** 2)
objects.sort(key=lambda x: x["bbox"][0] ** 2 + x["bbox"][1] ** 2)
OBJ_IDX_OFFSET = 10000
SUB_BIT_KIND, OBJ_BIT_KIND = 0, 1
all_boxes_with_idx = [(i, SUB_BIT_KIND, sub["bbox"][0], sub["bbox"][1]) for i, sub in enumerate(subjects)] + [
(i + OBJ_IDX_OFFSET, OBJ_BIT_KIND, obj["bbox"][0], obj["bbox"][1]) for i, obj in enumerate(objects)
]
seen_idx = set()
seen_sub_idx = set()
while N > len(seen_sub_idx):
candidates = []
for idx, kind, x0, y0 in all_boxes_with_idx:
if idx in seen_idx:
continue
candidates.append((idx, kind, x0, y0))
if len(candidates) == 0:
break
left_x = min([v[2] for v in candidates])
top_y = min([v[3] for v in candidates])
candidates.sort(key=lambda x: (x[2] - left_x) ** 2 + (x[3] - top_y) ** 2)
fst_idx, fst_kind, left_x, top_y = candidates[0]
candidates.sort(key=lambda x: (x[2] - left_x) ** 2 + (x[3] - top_y) ** 2)
nxt = None
for i in range(1, len(candidates)):
if candidates[i][1] ^ fst_kind == 1:
nxt = candidates[i]
break
if nxt is None:
break
if fst_kind == SUB_BIT_KIND:
sub_idx, obj_idx = fst_idx, nxt[0] - OBJ_IDX_OFFSET
else:
sub_idx, obj_idx = nxt[0], fst_idx - OBJ_IDX_OFFSET
pair_dis = bbox_distance(subjects[sub_idx]["bbox"], objects[obj_idx]["bbox"])
nearest_dis = float("inf")
for i in range(N):
if i in seen_idx or i == sub_idx:
continue
nearest_dis = min(nearest_dis, bbox_distance(subjects[i]["bbox"], objects[obj_idx]["bbox"]))
if pair_dis >= 3 * nearest_dis:
seen_idx.add(sub_idx)
continue
seen_idx.add(sub_idx)
seen_idx.add(obj_idx + OBJ_IDX_OFFSET)
seen_sub_idx.add(sub_idx)
ret.append(
{
"sub_bbox": {
"bbox": subjects[sub_idx]["bbox"],
"lines": subjects[sub_idx]["lines"],
"index": subjects[sub_idx]["index"],
},
"obj_bboxes": [
{"bbox": objects[obj_idx]["bbox"], "lines": objects[obj_idx]["lines"], "index": objects[obj_idx]["index"]}
],
"sub_idx": sub_idx,
}
)
for i in range(len(objects)):
j = i + OBJ_IDX_OFFSET
if j in seen_idx:
continue
seen_idx.add(j)
nearest_dis, nearest_sub_idx = float("inf"), -1
for k in range(len(subjects)):
dis = bbox_distance(objects[i]["bbox"], subjects[k]["bbox"])
if dis < nearest_dis:
nearest_dis = dis
nearest_sub_idx = k
for k in range(len(subjects)):
if k != nearest_sub_idx:
continue
if k in seen_sub_idx:
for kk in range(len(ret)):
if ret[kk]["sub_idx"] == k:
ret[kk]["obj_bboxes"].append(
{"bbox": objects[i]["bbox"], "lines": objects[i]["lines"], "index": objects[i]["index"]}
)
break
else:
ret.append(
{
"sub_bbox": {
"bbox": subjects[k]["bbox"],
"lines": subjects[k]["lines"],
"index": subjects[k]["index"],
},
"obj_bboxes": [
{"bbox": objects[i]["bbox"], "lines": objects[i]["lines"], "index": objects[i]["index"]}
],
"sub_idx": k,
}
)
seen_sub_idx.add(k)
seen_idx.add(k)
for i in range(len(subjects)):
if i in seen_sub_idx:
continue
ret.append(
{
"sub_bbox": {
"bbox": subjects[i]["bbox"],
"lines": subjects[i]["lines"],
"index": subjects[i]["index"],
},
"obj_bboxes": [],
"sub_idx": i,
}
)
return ret
def get_type_blocks(blocks, block_type: Literal["image", "table"]):
with_captions = __tie_up_category_by_distance_v3(blocks, f"{block_type}_body", f"{block_type}_caption")
with_footnotes = __tie_up_category_by_distance_v3(blocks, f"{block_type}_body", f"{block_type}_footnote")
ret = []
for v in with_captions:
record = {
f"{block_type}_body": v["sub_bbox"],
f"{block_type}_caption_list": v["obj_bboxes"],
}
filter_idx = v["sub_idx"]
d = next(filter(lambda x: x["sub_idx"] == filter_idx, with_footnotes))
record[f"{block_type}_footnote_list"] = d["obj_bboxes"]
ret.append(record)
return ret
def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table"]):
need_fix_blocks = get_type_blocks(blocks, fix_type)
fixed_blocks = []
for block in need_fix_blocks:
body = block[f"{fix_type}_body"]
caption_list = block[f"{fix_type}_caption_list"]
footnote_list = block[f"{fix_type}_footnote_list"]
body["type"] = f"{fix_type}_body"
for caption in caption_list:
caption["type"] = f"{fix_type}_caption"
for footnote in footnote_list:
footnote["type"] = f"{fix_type}_footnote"
two_layer_block = {
"type": fix_type,
"bbox": body["bbox"],
"blocks": [
body,
],
"index": body["index"],
}
two_layer_block["blocks"].extend([*caption_list, *footnote_list])
fixed_blocks.append(two_layer_block)
return fixed_blocks
def fix_title_blocks(blocks):
for block in blocks:
if block["type"] == BlockType.TITLE:
title_content = merge_para_with_text(block)
title_level = count_leading_hashes(title_content)
block['level'] = title_level
for line in block['lines']:
for span in line['spans']:
span['content'] = strip_leading_hashes(span['content'])
break
break
return blocks
def count_leading_hashes(text):
match = re.match(r'^(#+)', text)
return len(match.group(1)) if match else 0
def strip_leading_hashes(text):
# 去除开头的#和紧随其后的空格
return re.sub(r'^#+\s*', '', text)
def fix_text_blocks(blocks):
i = 0
while i < len(blocks):
block = blocks[i]
last_line = block["lines"][-1]if block["lines"] else None
if last_line:
last_span = last_line["spans"][-1] if last_line["spans"] else None
if last_span and last_span['content'].endswith('<|txt_contd|>'):
last_span['content'] = last_span['content'][:-len('<|txt_contd|>')]
# 查找下一个未被清空的块
next_idx = i + 1
while next_idx < len(blocks) and blocks[next_idx].get(SplitFlag.LINES_DELETED, False):
next_idx += 1
# 如果找到下一个有效块,则合并
if next_idx < len(blocks):
next_block = blocks[next_idx]
# 将下一个块的lines扩展到当前块的lines中
block["lines"].extend(next_block["lines"])
# 清空下一个块的lines
next_block["lines"] = []
# 在下一个块中添加标志
next_block[SplitFlag.LINES_DELETED] = True
# 不增加i,继续检查当前块(现在已包含下一个块的内容)
continue
i += 1
return blocks
\ No newline at end of file
from mineru.utils.config_reader import get_latex_delimiter_config
from mineru.utils.enum_class import MakeMode, BlockType, ContentType
latex_delimiters_config = get_latex_delimiter_config()
default_delimiters = {
'display': {'left': '$$', 'right': '$$'},
'inline': {'left': '$', 'right': '$'}
}
delimiters = latex_delimiters_config if latex_delimiters_config else default_delimiters
display_left_delimiter = delimiters['display']['left']
display_right_delimiter = delimiters['display']['right']
inline_left_delimiter = delimiters['inline']['left']
inline_right_delimiter = delimiters['inline']['right']
def merge_para_with_text(para_block):
para_text = ''
for line in para_block['lines']:
for j, span in enumerate(line['spans']):
span_type = span['type']
content = ''
if span_type == ContentType.TEXT:
content = span['content']
elif span_type == ContentType.INLINE_EQUATION:
content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
elif span_type == ContentType.INTERLINE_EQUATION:
content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
# content = content.strip()
if content:
if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
if j == len(line['spans']) - 1:
para_text += content
else:
para_text += f'{content} '
elif span_type == ContentType.INTERLINE_EQUATION:
para_text += content
return para_text
def mk_blocks_to_markdown(para_blocks, make_mode, img_buket_path=''):
page_markdown = []
for para_block in para_blocks:
para_text = ''
para_type = para_block['type']
if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX, BlockType.INTERLINE_EQUATION]:
para_text = merge_para_with_text(para_block)
elif para_type == BlockType.TITLE:
title_level = get_title_level(para_block)
para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
elif para_type == BlockType.IMAGE:
if make_mode == MakeMode.NLP_MD:
continue
elif make_mode == MakeMode.MM_MD:
# 检测是否存在图片脚注
has_image_footnote = any(block['type'] == BlockType.IMAGE_FOOTNOTE for block in para_block['blocks'])
# 如果存在图片脚注,则将图片脚注拼接到图片正文后面
if has_image_footnote:
for block in para_block['blocks']: # 1st.拼image_caption
if block['type'] == BlockType.IMAGE_CAPTION:
para_text += merge_para_with_text(block) + ' \n'
for block in para_block['blocks']: # 2nd.拼image_body
if block['type'] == BlockType.IMAGE_BODY:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.IMAGE:
if span.get('image_path', ''):
para_text += f"![]({img_buket_path}/{span['image_path']})"
for block in para_block['blocks']: # 3rd.拼image_footnote
if block['type'] == BlockType.IMAGE_FOOTNOTE:
para_text += ' \n' + merge_para_with_text(block)
else:
for block in para_block['blocks']: # 1st.拼image_body
if block['type'] == BlockType.IMAGE_BODY:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.IMAGE:
if span.get('image_path', ''):
para_text += f"![]({img_buket_path}/{span['image_path']})"
for block in para_block['blocks']: # 2nd.拼image_caption
if block['type'] == BlockType.IMAGE_CAPTION:
para_text += ' \n' + merge_para_with_text(block)
elif para_type == BlockType.TABLE:
if make_mode == MakeMode.NLP_MD:
continue
elif make_mode == MakeMode.MM_MD:
for block in para_block['blocks']: # 1st.拼table_caption
if block['type'] == BlockType.TABLE_CAPTION:
para_text += merge_para_with_text(block) + ' \n'
for block in para_block['blocks']: # 2nd.拼table_body
if block['type'] == BlockType.TABLE_BODY:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.TABLE:
# if processed by table model
if span.get('html', ''):
para_text += f"\n{span['html']}\n"
elif span.get('image_path', ''):
para_text += f"![]({img_buket_path}/{span['image_path']})"
for block in para_block['blocks']: # 3rd.拼table_footnote
if block['type'] == BlockType.TABLE_FOOTNOTE:
para_text += '\n' + merge_para_with_text(block) + ' '
if para_text.strip() == '':
continue
else:
# page_markdown.append(para_text.strip() + ' ')
page_markdown.append(para_text.strip())
return page_markdown
def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
para_type = para_block['type']
para_content = {}
if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
para_content = {
'type': 'text',
'text': merge_para_with_text(para_block),
}
elif para_type == BlockType.TITLE:
title_level = get_title_level(para_block)
para_content = {
'type': 'text',
'text': merge_para_with_text(para_block),
}
if title_level != 0:
para_content['text_level'] = title_level
elif para_type == BlockType.INTERLINE_EQUATION:
para_content = {
'type': 'equation',
'text': merge_para_with_text(para_block),
'text_format': 'latex',
}
elif para_type == BlockType.IMAGE:
para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
for block in para_block['blocks']:
if block['type'] == BlockType.IMAGE_BODY:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.IMAGE:
if span.get('image_path', ''):
para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
if block['type'] == BlockType.IMAGE_CAPTION:
para_content['img_caption'].append(merge_para_with_text(block))
if block['type'] == BlockType.IMAGE_FOOTNOTE:
para_content['img_footnote'].append(merge_para_with_text(block))
elif para_type == BlockType.TABLE:
para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
for block in para_block['blocks']:
if block['type'] == BlockType.TABLE_BODY:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.TABLE:
if span.get('html', ''):
para_content['table_body'] = f"{span['html']}"
if span.get('image_path', ''):
para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
if block['type'] == BlockType.TABLE_CAPTION:
para_content['table_caption'].append(merge_para_with_text(block))
if block['type'] == BlockType.TABLE_FOOTNOTE:
para_content['table_footnote'].append(merge_para_with_text(block))
para_content['page_idx'] = page_idx
return para_content
def union_make(pdf_info_dict: list,
make_mode: str,
img_buket_path: str = '',
):
output_content = []
for page_info in pdf_info_dict:
paras_of_layout = page_info.get('para_blocks')
page_idx = page_info.get('page_idx')
if not paras_of_layout:
continue
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path)
output_content.extend(page_markdown)
elif make_mode == MakeMode.CONTENT_LIST:
for para_block in paras_of_layout:
para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx)
output_content.append(para_content)
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
return '\n\n'.join(output_content)
elif make_mode == MakeMode.CONTENT_LIST:
return output_content
return None
def get_title_level(block):
title_level = block.get('level', 1)
if title_level > 4:
title_level = 4
elif title_level < 1:
title_level = 0
return title_level
# Copyright (c) Opendatalab. All rights reserved.
import os
import click
from pathlib import Path
import torch
from loguru import logger
from mineru.utils.model_utils import get_vram
from ..version import __version__
from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
@click.command()
@click.version_option(__version__,
'--version',
'-v',
help='display the version and exit')
@click.option(
'-p',
'--path',
'input_path',
type=click.Path(exists=True),
required=True,
help='local filepath or directory. support pdf, png, jpg, jpeg files',
)
@click.option(
'-o',
'--output',
'output_dir',
type=click.Path(),
required=True,
help='output local directory',
)
@click.option(
'-m',
'--method',
'method',
type=click.Choice(['auto', 'txt', 'ocr']),
help="""the method for parsing pdf:
auto: Automatically determine the method based on the file type.
txt: Use text extraction method.
ocr: Use OCR method for image-based PDFs.
Without method specified, 'auto' will be used by default.""",
default='auto',
)
@click.option(
'-b',
'--backend',
'backend',
type=click.Choice(['pipeline', 'vlm-huggingface', 'vlm-sglang-engine', 'vlm-sglang-client']),
help="""the backend for parsing pdf:
pipeline: More general.
vlm-huggingface: More general.
vlm-sglang-engine: Faster(engine).
vlm-sglang-client: Faster(client).
without method specified, pipeline will be used by default.""",
default='pipeline',
)
@click.option(
'-l',
'--lang',
'lang',
type=click.Choice(['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']),
help="""
Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
Without languages specified, 'ch' will be used by default.
Adapted only for the case where the backend is set to "pipeline".
""",
default='ch',
)
@click.option(
'-u',
'--url',
'server_url',
type=str,
help="""
When the backend is `sglang-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
""",
default=None,
)
@click.option(
'-s',
'--start',
'start_page_id',
type=int,
help='The starting page for PDF parsing, beginning from 0.',
default=0,
)
@click.option(
'-e',
'--end',
'end_page_id',
type=int,
help='The ending page for PDF parsing, beginning from 0.',
default=None,
)
@click.option(
'-f',
'--formula',
'formula_enable',
type=bool,
help='Enable formula parsing. Default is True. Adapted only for the case where the backend is set to "pipeline".',
default=True,
)
@click.option(
'-t',
'--table',
'table_enable',
type=bool,
help='Enable table parsing. Default is True. Adapted only for the case where the backend is set to "pipeline".',
default=True,
)
@click.option(
'-d',
'--device',
'device_mode',
type=str,
help='Device mode for model inference, e.g., "cpu", "cuda", "cuda:0", "npu", "npu:0", "mps". Adapted only for the case where the backend is set to "pipeline". ',
default=None,
)
@click.option(
'--vram',
'virtual_vram',
type=int,
help='Upper limit of GPU memory occupied by a single process. Adapted only for the case where the backend is set to "pipeline". ',
default=None,
)
@click.option(
'--source',
'model_source',
type=click.Choice(['huggingface', 'modelscope', 'local']),
help="""
The source of the model repository. Default is 'huggingface'.
""",
default='huggingface',
)
def main(input_path, output_dir, method, backend, lang, server_url, start_page_id, end_page_id, formula_enable, table_enable, device_mode, virtual_vram, model_source):
if os.getenv('MINERU_FORMULA_ENABLE', None) is None:
os.environ['MINERU_FORMULA_ENABLE'] = str(formula_enable).lower()
if os.getenv('MINERU_TABLE_ENABLE', None) is None:
os.environ['MINERU_TABLE_ENABLE'] = str(table_enable).lower()
def get_device_mode() -> str:
if device_mode is not None:
return device_mode
if torch.cuda.is_available():
return "cuda"
if torch.backends.mps.is_available():
return "mps"
return "cpu"
if os.getenv('MINERU_DEVICE_MODE', None) is None:
os.environ['MINERU_DEVICE_MODE'] = get_device_mode()
def get_virtual_vram_size() -> int:
if virtual_vram is not None:
return virtual_vram
if get_device_mode().startswith("cuda") or get_device_mode().startswith("npu"):
return round(get_vram(get_device_mode()))
return 1
if os.getenv('MINERU_VIRTUAL_VRAM_SIZE', None) is None:
os.environ['MINERU_VIRTUAL_VRAM_SIZE']= str(get_virtual_vram_size())
if os.getenv('MINERU_MODEL_SOURCE', None) is None:
os.environ['MINERU_MODEL_SOURCE'] = model_source
os.makedirs(output_dir, exist_ok=True)
def parse_doc(path_list: list[Path]):
try:
file_name_list = []
pdf_bytes_list = []
lang_list = []
for path in path_list:
file_name = str(Path(path).stem)
pdf_bytes = read_fn(path)
file_name_list.append(file_name)
pdf_bytes_list.append(pdf_bytes)
lang_list.append(lang)
do_parse(
output_dir=output_dir,
pdf_file_names=file_name_list,
pdf_bytes_list=pdf_bytes_list,
p_lang_list=lang_list,
backend=backend,
parse_method=method,
server_url=server_url,
start_page_id=start_page_id,
end_page_id=end_page_id
)
except Exception as e:
logger.exception(e)
if os.path.isdir(input_path):
doc_path_list = []
for doc_path in Path(input_path).glob('*'):
if doc_path.suffix in pdf_suffixes + image_suffixes:
doc_path_list.append(doc_path)
parse_doc(doc_path_list)
else:
parse_doc([Path(input_path)])
if __name__ == '__main__':
main()
# Copyright (c) Opendatalab. All rights reserved.
import io
import json
import os
import copy
from pathlib import Path
import pypdfium2 as pdfium
from loguru import logger
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
from mineru.utils.enum_class import MakeMode
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
pdf_suffixes = [".pdf"]
image_suffixes = [".png", ".jpeg", ".jpg"]
def read_fn(path):
if not isinstance(path, Path):
path = Path(path)
with open(str(path), "rb") as input_file:
file_bytes = input_file.read()
if path.suffix in image_suffixes:
return images_bytes_to_pdf_bytes(file_bytes)
elif path.suffix in pdf_suffixes:
return file_bytes
else:
raise Exception(f"Unknown file suffix: {path.suffix}")
def prepare_env(output_dir, pdf_file_name, parse_method):
local_md_dir = str(os.path.join(output_dir, pdf_file_name, parse_method))
local_image_dir = os.path.join(str(local_md_dir), "images")
os.makedirs(local_image_dir, exist_ok=True)
os.makedirs(local_md_dir, exist_ok=True)
return local_image_dir, local_md_dir
def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page_id=None):
# 从字节数据加载PDF
pdf = pdfium.PdfDocument(pdf_bytes)
# 确定结束页
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf) - 1
if end_page_id > len(pdf) - 1:
logger.warning("end_page_id is out of range, use pdf_docs length")
end_page_id = len(pdf) - 1
# 创建一个新的PDF文档
output_pdf = pdfium.PdfDocument.new()
# 选择要导入的页面索引
page_indices = list(range(start_page_id, end_page_id + 1))
# 从原PDF导入页面到新PDF
output_pdf.import_pages(pdf, page_indices)
# 将新PDF保存到内存缓冲区
output_buffer = io.BytesIO()
output_pdf.save(output_buffer)
# 获取字节数据
output_bytes = output_buffer.getvalue()
pdf.close() # 关闭原PDF文档以释放资源
output_pdf.close() # 关闭新PDF文档以释放资源
return output_bytes
def do_parse(
output_dir,
pdf_file_names: list[str],
pdf_bytes_list: list[bytes],
p_lang_list: list[str],
backend="pipeline",
parse_method="auto",
p_formula_enable=True,
p_table_enable=True,
server_url=None,
f_draw_layout_bbox=True,
f_draw_span_bbox=True,
f_dump_md=True,
f_dump_middle_json=True,
f_dump_model_output=True,
f_dump_orig_pdf=True,
f_dump_content_list=True,
f_make_md_mode=MakeMode.MM_MD,
start_page_id=0,
end_page_id=None,
):
if backend == "pipeline":
for idx, pdf_bytes in enumerate(pdf_bytes_list):
new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
pdf_bytes_list[idx] = new_pdf_bytes
infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(pdf_bytes_list, p_lang_list, parse_method=parse_method, formula_enable=p_formula_enable,table_enable=p_table_enable)
for idx, model_list in enumerate(infer_results):
model_json = copy.deepcopy(model_list)
pdf_file_name = pdf_file_names[idx]
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
images_list = all_image_lists[idx]
pdf_doc = all_pdf_docs[idx]
_lang = lang_list[idx]
_ocr_enable = ocr_enabled_list[idx]
middle_json = pipeline_result_to_middle_json(model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, p_formula_enable)
pdf_info = middle_json["pdf_info"]
pdf_bytes = pdf_bytes_list[idx]
if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
if f_draw_span_bbox:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
if f_dump_orig_pdf:
md_writer.write(
f"{pdf_file_name}_origin.pdf",
pdf_bytes,
)
if f_dump_md:
image_dir = str(os.path.basename(local_image_dir))
md_content_str = pipeline_union_make(pdf_info, f_make_md_mode, image_dir)
md_writer.write_string(
f"{pdf_file_name}.md",
md_content_str,
)
if f_dump_content_list:
image_dir = str(os.path.basename(local_image_dir))
content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
)
if f_dump_middle_json:
md_writer.write_string(
f"{pdf_file_name}_middle.json",
json.dumps(middle_json, ensure_ascii=False, indent=4),
)
if f_dump_model_output:
md_writer.write_string(
f"{pdf_file_name}_model.json",
json.dumps(model_json, ensure_ascii=False, indent=4),
)
logger.info(f"local output dir is {local_md_dir}")
else:
if backend.startswith("vlm-"):
backend = backend[4:]
f_draw_span_bbox = False
parse_method = "vlm"
for idx, pdf_bytes in enumerate(pdf_bytes_list):
pdf_file_name = pdf_file_names[idx]
pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
model_path = auto_download_and_get_model_root_path('/', 'vlm')
middle_json, infer_result = vlm_doc_analyze(pdf_bytes, image_writer=image_writer, backend=backend, model_path=model_path, server_url=server_url)
pdf_info = middle_json["pdf_info"]
if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
if f_draw_span_bbox:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
if f_dump_orig_pdf:
md_writer.write(
f"{pdf_file_name}_origin.pdf",
pdf_bytes,
)
if f_dump_md:
image_dir = str(os.path.basename(local_image_dir))
md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
md_writer.write_string(
f"{pdf_file_name}.md",
md_content_str,
)
if f_dump_content_list:
image_dir = str(os.path.basename(local_image_dir))
content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
)
if f_dump_middle_json:
md_writer.write_string(
f"{pdf_file_name}_middle.json",
json.dumps(middle_json, ensure_ascii=False, indent=4),
)
if f_dump_model_output:
model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
md_writer.write_string(
f"{pdf_file_name}_model_output.txt",
model_output,
)
logger.info(f"local output dir is {local_md_dir}")
if __name__ == "__main__":
# pdf_path = "../../demo/pdfs/demo3.pdf"
pdf_path = "C:/Users/zhaoxiaomeng/Downloads/4546d0e2-ba60-40a5-a17e-b68555cec741.pdf"
try:
do_parse("./output", [Path(pdf_path).stem], [read_fn(Path(pdf_path))],["ch"],
end_page_id=10,
backend='vlm-huggingface'
# backend = 'pipeline'
)
except Exception as e:
logger.exception(e)
import json
import os
import sys
import click
import requests
from mineru.utils.enum_class import ModelPath
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
def download_json(url):
"""下载JSON文件"""
response = requests.get(url)
response.raise_for_status()
return response.json()
def download_and_modify_json(url, local_filename, modifications):
"""下载JSON并修改内容"""
if os.path.exists(local_filename):
data = json.load(open(local_filename))
config_version = data.get('config_version', '0.0.0')
if config_version < '1.3.0':
data = download_json(url)
else:
data = download_json(url)
# 修改内容
for key, value in modifications.items():
if key in data:
if isinstance(data[key], dict):
# 如果是字典,合并新值
data[key].update(value)
else:
# 否则直接替换
data[key] = value
# 保存修改后的内容
with open(local_filename, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
def configure_model(model_dir, model_type):
"""配置模型"""
# json_url = 'https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/mineru.template.json'
json_url = 'https://gcore.jsdelivr.net/gh/myhloli/Magic-PDF@dev/mineru.template.json'
config_file_name = 'mineru.json'
home_dir = os.path.expanduser('~')
config_file = os.path.join(home_dir, config_file_name)
json_mods = {
'models-dir': {
f'{model_type}': model_dir
}
}
download_and_modify_json(json_url, config_file, json_mods)
print(f'The configuration file has been successfully configured, the path is: {config_file}')
@click.command()
@click.option(
'-s',
'--source',
'model_source',
type=click.Choice(['huggingface', 'modelscope']),
help="""
The source of the model repository.
""",
default=None,
)
@click.option(
'-m',
'--model_type',
'model_type',
type=click.Choice(['pipeline', 'vlm', 'all']),
help="""
The type of the model to download.
""",
default=None,
)
def download_models(model_source, model_type):
"""Download MinerU model files.
Supports downloading pipeline or VLM models from ModelScope or HuggingFace.
"""
# 如果未显式指定则交互式输入下载来源
if model_source is None:
model_source = click.prompt(
"Please select the model download source: ",
type=click.Choice(['huggingface', 'modelscope']),
default='huggingface'
)
if os.getenv('MINERU_MODEL_SOURCE', None) is None:
os.environ['MINERU_MODEL_SOURCE'] = model_source
# 如果未显式指定则交互式输入模型类型
if model_type is None:
model_type = click.prompt(
"Please select the model type to download: ",
type=click.Choice(['pipeline', 'vlm', 'all']),
default='all'
)
click.echo(f"Downloading {model_type} model from {os.getenv('MINERU_MODEL_SOURCE', None)}...")
def download_pipeline_models():
"""下载Pipeline模型"""
model_paths = [
ModelPath.doclayout_yolo,
ModelPath.yolo_v8_mfd,
ModelPath.unimernet_small,
ModelPath.pytorch_paddle,
ModelPath.layout_reader,
ModelPath.slanet_plus
]
download_finish_path = ""
for model_path in model_paths:
click.echo(f"Downloading model: {model_path}")
download_finish_path = auto_download_and_get_model_root_path(model_path, repo_mode='pipeline')
click.echo(f"Pipeline models downloaded successfully to: {download_finish_path}")
configure_model(download_finish_path, model_type)
def download_vlm_models():
"""下载VLM模型"""
download_finish_path = auto_download_and_get_model_root_path("/", repo_mode='vlm')
click.echo(f"VLM models downloaded successfully to: {download_finish_path}")
configure_model(download_finish_path, model_type)
try:
if model_type == 'pipeline':
download_pipeline_models()
elif model_type == 'vlm':
download_vlm_models()
elif model_type == 'all':
download_pipeline_models()
download_vlm_models()
else:
click.echo(f"Unsupported model type: {model_type}", err=True)
sys.exit(1)
except Exception as e:
click.echo(f"Download failed: {str(e)}", err=True)
sys.exit(1)
if __name__ == '__main__':
download_models()
from ..model.vlm_sglang_model.server import main
if __name__ == "__main__":
main()
# Copyright (c) Opendatalab. All rights reserved.
from .base import DataReader, DataWriter
from .dummy import DummyDataWriter
from .filebase import FileBasedDataReader, FileBasedDataWriter
from .multi_bucket_s3 import MultiBucketS3DataReader, MultiBucketS3DataWriter
from .s3 import S3DataReader, S3DataWriter
__all__ = [
"DataReader",
"DataWriter",
"FileBasedDataReader",
"FileBasedDataWriter",
"S3DataReader",
"S3DataWriter",
"MultiBucketS3DataReader",
"MultiBucketS3DataWriter",
"DummyDataWriter",
]
from .base import DataWriter
class DummyDataWriter(DataWriter):
def write(self, path: str, data: bytes) -> None:
"""Dummy write method that does nothing."""
pass
def write_string(self, path: str, data: str) -> None:
"""Dummy write_string method that does nothing."""
pass
import os
from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
from .base import DataReader, DataWriter
class FileBasedDataReader(DataReader):
......
from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
from magic_pdf.data.io.s3 import S3Reader, S3Writer
from magic_pdf.data.schemas import S3Config
from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
remove_non_official_s3_args)
from ..utils.exceptions import InvalidConfig, InvalidParams
from .base import DataReader, DataWriter
from ..io.s3 import S3Reader, S3Writer
from ..utils.schemas import S3Config
from ..utils.path_utils import parse_s3_range_params, parse_s3path, remove_non_official_s3_args
class MultiS3Mixin:
......
from magic_pdf.data.data_reader_writer.multi_bucket_s3 import (
MultiBucketS3DataReader, MultiBucketS3DataWriter)
from magic_pdf.data.schemas import S3Config
from .multi_bucket_s3 import MultiBucketS3DataReader, MultiBucketS3DataWriter
from ..utils.schemas import S3Config
class S3DataReader(MultiBucketS3DataReader):
......
from .base import IOReader, IOWriter
from .http import HttpReader, HttpWriter
from .s3 import S3Reader, S3Writer
__all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer']
\ No newline at end of file
......@@ -3,7 +3,7 @@ import io
import requests
from magic_pdf.data.io.base import IOReader, IOWriter
from .base import IOReader, IOWriter
class HttpReader(IOReader):
......
import boto3
from botocore.config import Config
from magic_pdf.data.io.base import IOReader, IOWriter
from ..io.base import IOReader, IOWriter
class S3Reader(IOReader):
......
# Copyright (c) Opendatalab. All rights reserved.
# Copyright (c) Opendatalab. All rights reserved.
class FileNotExisted(Exception):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment