Unverified commit 6ab12348, authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge pull request #2625 from opendatalab/release-2.0.0

Release 2.0.0
parents 9487d33d 4fbec469
def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                                    images, tables, interline_equations, discarded_blocks, need_drop, drop_reason):
    """Bundle the per-page parsing results into a single page-info dict.

    Every argument is stored unchanged under its own key; ``page_w`` and
    ``page_h`` are combined into the two-element ``'page_size'`` list.

    Returns:
        dict: The assembled page component.
    """
    return dict(
        preproc_blocks=blocks,
        layout_bboxes=layout_bboxes,
        page_idx=page_id,
        page_size=[page_w, page_h],
        _layout_tree=layout_tree,
        images=images,
        tables=tables,
        interline_equations=interline_equations,
        discarded_blocks=discarded_blocks,
        need_drop=need_drop,
        drop_reason=drop_reason,
    )
from loguru import logger
from magic_pdf.config.ocr_content_type import ContentType
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.pdf_image_tools import cut_image
def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
    """Crop every image/table span out of the page and record the crop path.

    Spans whose type is ``ContentType.Image`` or ``ContentType.Table`` get an
    ``'image_path'`` entry holding the value returned by ``cut_image``.  Spans
    of any other type, spans whose bbox fails ``check_img_bbox``, and all
    spans when ``imageWriter`` is falsy are left untouched.

    Args:
        spans: List of span dicts with at least ``'type'`` and ``'bbox'``.
        page: Page object handed through to ``cut_image``.
        page_id: Page index handed through to ``cut_image``.
        pdf_bytes_md5: First path component of the crop's return path.
        imageWriter: Writer handed through to ``cut_image``.

    Returns:
        The same ``spans`` list, modified in place.
    """
    for span in spans:
        kind = span['type']
        if kind == ContentType.Image:
            subdir = 'images'
        elif kind == ContentType.Table:
            subdir = 'tables'
        else:
            continue

        # The bbox check runs first so its warning is logged even without a writer.
        if not (check_img_bbox(span['bbox']) and imageWriter):
            continue

        span['image_path'] = cut_image(
            span['bbox'],
            page_id,
            page,
            return_path=join_path(pdf_bytes_md5, subdir),
            imageWriter=imageWriter,
        )
    return spans
def check_img_bbox(bbox) -> bool:
    """Tell whether ``bbox`` ([x0, y0, x1, y1]) has positive width and height.

    A warning is logged and False is returned when ``x0 >= x1`` or
    ``y0 >= y1``; otherwise True is returned.
    """
    # Both comparisons are evaluated up front, exactly like the original list form.
    width_invalid = bbox[0] >= bbox[2]
    height_invalid = bbox[1] >= bbox[3]
    if width_invalid or height_invalid:
        logger.warning(f'image_bboxes: 错误的box, {bbox}')
        return False
    return True
from magic_pdf.config.drop_tag import DropTag
from magic_pdf.config.ocr_content_type import BlockType
from magic_pdf.libs.boxbase import calculate_iou, get_minbox_if_overlap_by_ratio
def remove_overlaps_low_confidence_spans(spans):
    """Drop the lower-scored span of every pair of heavily overlapping spans.

    Two spans that are not equal and whose bboxes have an IoU above 0.9 are
    considered duplicates; the one with the smaller ``'score'`` is dropped
    (the second one when the first is not strictly smaller).  A span that
    has already been dropped no longer takes part in comparisons.

    Args:
        spans: List of span dicts with ``'bbox'`` and ``'score'``.

    Returns:
        tuple: ``(spans, dropped_spans)`` — ``spans`` is the input list with
        the dropped spans removed in place; every dropped span is tagged
        with ``DropTag.SPAN_OVERLAP``.
    """
    dropped_spans = []
    for first in spans:
        for second in spans:
            if (
                first != second
                and first not in dropped_spans
                and second not in dropped_spans
                and calculate_iou(first['bbox'], second['bbox']) > 0.9
            ):
                loser = first if first['score'] < second['score'] else second
                dropped_spans.append(loser)

    for loser in dropped_spans:
        # Remove before tagging: list.remove() matches by equality.
        spans.remove(loser)
        loser['tag'] = DropTag.SPAN_OVERLAP

    return spans, dropped_spans
def check_chars_is_overlap_in_span(chars):
    """Return True as soon as any two chars have bboxes with an IoU above 0.35.

    Args:
        chars: List of char dicts, each carrying a ``'bbox'``.

    Returns:
        bool: True when at least one pair overlaps that much, else False.
    """
    return any(
        calculate_iou(earlier['bbox'], later['bbox']) > 0.35
        for position, earlier in enumerate(chars)
        for later in chars[position + 1:]
    )
def remove_x_overlapping_chars(span, median_width):
    """Remove neighbouring chars of a span that overlap strongly on the x-axis.

    Adjacent chars are compared pairwise.  When their horizontal overlap is
    wider than 30% of ``median_width`` and the two chars are the same
    character, or either of them is a space, the narrower of the two is
    removed (the second one on a tie) and the new pair at the same position
    is examined next.

    Args:
        span (dict): A span that may hold a ``'chars'`` list; each char has
            a ``'c'`` character and a ``'bbox'`` in [x0, y0, x1, y1] form.
        median_width: Reference char width used to derive the threshold.

    Returns:
        dict: The same span object, with ``span['chars']`` edited in place.
    """
    if 'chars' not in span or len(span['chars']) < 2:
        return span

    chars = span['chars']
    min_overlap = median_width * 0.3

    pos = 0
    while pos < len(chars) - 1:
        left, right = chars[pos], chars[pos + 1]

        # Horizontal extent shared by the two boxes (empty when end <= start).
        shared_start = max(left['bbox'][0], right['bbox'][0])
        shared_end = min(left['bbox'][2], right['bbox'][2])
        overlaps_enough = shared_end > shared_start and (shared_end - shared_start) > min_overlap

        if overlaps_enough and (left['c'] == right['c'] or left['c'] == ' ' or right['c'] == ' '):
            left_width = left['bbox'][2] - left['bbox'][0]
            right_width = right['bbox'][2] - right['bbox'][0]
            # Drop the narrower char; pos is not advanced so the new pair is re-checked.
            chars.pop(pos if left_width < right_width else pos + 1)
            continue

        pos += 1

    return span
def remove_overlaps_min_spans(spans):
    """Drop spans singled out by ``get_minbox_if_overlap_by_ratio`` for overlapping pairs.

    For every pair of unequal, not-yet-dropped spans the helper is called
    with a ratio of 0.65.  When it returns a box, the first span in
    ``spans`` whose bbox equals that box is dropped (presumably the smaller
    span of the pair — the helper's contract is not visible here).

    Args:
        spans: List of span dicts carrying a ``'bbox'``.

    Returns:
        tuple: ``(spans, dropped_spans)`` — ``spans`` is the input list with
        the dropped spans removed in place; every dropped span is tagged
        with ``DropTag.SPAN_OVERLAP``.
    """
    dropped_spans = []
    for first in spans:
        for second in spans:
            if first == second:
                continue
            if first in dropped_spans or second in dropped_spans:
                continue

            overlap_box = get_minbox_if_overlap_by_ratio(first['bbox'], second['bbox'], 0.65)
            if overlap_box is None:
                continue

            victim = None
            for candidate in spans:
                if candidate['bbox'] == overlap_box:
                    victim = candidate
                    break
            if victim is not None and victim not in dropped_spans:
                dropped_spans.append(victim)

    for victim in dropped_spans:
        # Remove before tagging: list.remove() matches by equality.
        spans.remove(victim)
        victim['tag'] = DropTag.SPAN_OVERLAP

    return spans, dropped_spans
def get_qa_need_list_v2(blocks):
    """Split out the image, table and interline-equation blocks.

    Args:
        blocks: List of block dicts, each carrying a ``'type'``.

    Returns:
        tuple: ``(images, tables, interline_equations)`` — three new lists
        holding the matching block objects in their original order.  Blocks
        of any other type are ignored.
    """
    images, tables, interline_equations = [], [], []
    for block in blocks:
        block_type = block['type']
        if block_type == BlockType.Image:
            target = images
        elif block_type == BlockType.Table:
            target = tables
        elif block_type == BlockType.InterlineEquation:
            target = interline_equations
        else:
            continue
        target.append(block)
    return images, tables, interline_equations
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.libs.boxbase import _is_in, _is_part_overlap
def _remove_overlap_between_bbox(bbox1, bbox2):
    """Shrink two partially overlapping boxes so that they no longer overlap.

    Args:
        bbox1: First box as (x0, y0, x1, y1).
        bbox2: Second box as (x0, y0, x1, y1).

    Returns:
        tuple: ``(bbox1, bbox2, drop_reason)``.  When ``_is_part_overlap``
        reports no partial overlap the inputs are returned unchanged with
        ``None``.  Otherwise the adjusted boxes (as new lists) are returned
        with ``None``, or — if the adjustment would leave either box without
        positive width and height — the original boxes together with
        ``DropReason.NEGATIVE_BBOX_AREA``.
    """
    if _is_part_overlap(bbox1, bbox2):
        ix0, iy0, ix1, iy1 = bbox1
        x0, y0, x1, y1 = bbox2
        # Extent of the shared region along each axis.
        diff_x = min(x1, ix1) - max(x0, ix0)
        diff_y = min(y1, iy1) - max(y0, iy0)
        if diff_y > diff_x:
            # The shared region is taller than wide: separate the boxes along x.
            if x1 >= ix1:
                # bbox2 reaches further right: cut between bbox2's left edge and bbox1's right edge.
                mid = (x0 + ix1) // 2
                # Leave a 0.5 gap (0.25 on each side of mid) between the boxes.
                ix1 = min(mid - 0.25, ix1)
                x0 = max(mid + 0.25, x0)
            else:
                mid = (ix0 + x1) // 2
                ix0 = max(mid + 0.25, ix0)
                x1 = min(mid - 0.25, x1)
        else:
            # Otherwise separate the boxes along y.
            if y1 >= iy1:
                # bbox2 reaches further down: cut between bbox2's top edge and bbox1's bottom edge.
                mid = (y0 + iy1) // 2
                y0 = max(mid + 0.25, y0)
                iy1 = min(iy1, mid - 0.25)
            else:
                mid = (iy0 + y1) // 2
                y1 = min(y1, mid - 0.25)
                iy0 = max(mid + 0.25, iy0)
        # Accept the adjustment only if both boxes keep a positive width and height.
        if ix1 > ix0 and iy1 > iy0 and y1 > y0 and x1 > x0:
            bbox1 = [ix0, iy0, ix1, iy1]
            bbox2 = [x0, y0, x1, y1]
            return bbox1, bbox2, None
        else:
            return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA
    else:
        return bbox1, bbox2, None
def _remove_overlap_between_bboxes(arr):
    """Resolve overlaps among a list of scored boxes.

    Args:
        arr: List of dicts with ``'bbox'`` and ``'score'``; the ``'bbox'``
            entries of kept items may be replaced in place.

    Returns:
        tuple: ``(res, drop_reasons)``.  ``res`` has the same length as
        ``arr`` and holds, at each index, either the kept item or ``None``
        for a dropped one.  ``drop_reasons`` collects the reason returned by
        ``_remove_overlap_between_bbox`` for every failed pair adjustment.
    """
    drop_reasons = []
    N = len(arr)
    keeps = [True] * N
    res = [None] * N
    # Pass 1: discard every item for which _is_in(item, other) holds for some other item
    # (presumably: the item's box lies inside the other box — verify against _is_in).
    for i in range(N):
        for j in range(N):
            if i == j:
                continue
            if _is_in(arr[i]['bbox'], arr[j]['bbox']):
                keeps[i] = False
    # Pass 2: add the remaining items one by one, de-overlapping each against the items accepted so far.
    for idx, v in enumerate(arr):
        if not keeps[idx]:
            continue
        for i in range(N):
            # Only slots that already hold an accepted item are compared.
            if res[i] is None:
                continue
            bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(
                v['bbox'], res[i]['bbox']
            )
            if drop_reason is None:
                # Both boxes (possibly shrunk) are kept.
                v['bbox'] = bbox1
                res[i]['bbox'] = bbox2
            else:
                # The pair could not be separated: keep the higher-scored item (ties drop the new one).
                if v['score'] > res[i]['score']:
                    keeps[i] = False
                    res[i] = None
                else:
                    keeps[idx] = False
                drop_reasons.append(drop_reason)
        if keeps[idx]:
            res[idx] = v
    return res, drop_reasons
def remove_overlap_between_bbox_for_span(spans):
    """De-overlap the bboxes of ``spans`` via ``_remove_overlap_between_bboxes``.

    A span without a ``'score'`` is treated as having a score of 0.1.

    Args:
        spans: List of span dicts carrying a ``'bbox'``.

    Returns:
        tuple: ``(kept_spans, drop_reasons)`` — the surviving span objects
        (their ``'bbox'`` updated in place) in original order, and the drop
        reasons reported by the helper.
    """
    candidates = [
        {'bbox': span['bbox'], 'score': span.get('score', 0.1)}
        for span in spans
    ]
    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)

    kept_spans = []
    for position, entry in enumerate(resolved):
        if entry is not None:
            span = spans[position]
            span['bbox'] = entry['bbox']
            kept_spans.append(span)
    return kept_spans, drop_reasons
def remove_overlap_between_bbox_for_block(all_bboxes):
    """De-overlap block boxes via ``_remove_overlap_between_bboxes``.

    Args:
        all_bboxes: List of mutable sequences whose first four items are the
            box coordinates and whose last item is used as the score.

    Returns:
        tuple: ``(kept_bboxes, drop_reasons)`` — the surviving entries of
        ``all_bboxes`` (their first four items overwritten in place) in
        original order, and the drop reasons reported by the helper.
    """
    candidates = [
        {'bbox': entry[:4], 'score': entry[-1]}
        for entry in all_bboxes
    ]
    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)

    kept_bboxes = []
    for position, item in enumerate(resolved):
        if item is not None:
            all_bboxes[position][:4] = item['bbox']
            kept_bboxes.append(all_bboxes[position])
    return kept_bboxes, drop_reasons
weights:
layoutlmv3: Layout/LayoutLMv3/model_final.pth
doclayout_yolo: Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
unimernet_small: MFR/unimernet_hf_small_2503
struct_eqtable: TabRec/StructEqTable
tablemaster: TabRec/TableMaster
rapid_table: TabRec/RapidTable
\ No newline at end of file
from loguru import logger
from magic_pdf.config.drop_reason import DropReason
def get_data_source(jso: dict):
    """Return ``jso['data_source']``, falling back to ``'file_source'`` when it is missing or None."""
    source = jso.get('data_source')
    return jso.get('file_source') if source is None else source
def get_data_type(jso: dict):
    """Return ``jso['data_type']``, falling back to ``'file_type'`` when it is missing or None."""
    kind = jso.get('data_type')
    return jso.get('file_type') if kind is None else kind
def get_bookid(jso: dict):
    """Return ``jso['bookid']``, falling back to ``'original_file_id'`` when it is missing or None."""
    identifier = jso.get('bookid')
    return jso.get('original_file_id') if identifier is None else identifier
def exception_handler(jso: dict, e):
    """Log ``e`` and mark the record ``jso`` as dropped because of it.

    Sets ``'_need_drop'`` to True, ``'_drop_reason'`` to
    ``DropReason.Exception`` and ``'_exception'`` to ``'ERROR: <e>'``.

    Returns:
        dict: The same ``jso`` object, modified in place.
    """
    logger.exception(e)
    for field, value in (
        ('_need_drop', True),
        ('_drop_reason', DropReason.Exception),
        ('_exception', f'ERROR: {e}'),
    ):
        jso[field] = value
    return jso
def get_bookname(jso: dict):
    """Build the book name ``'<data source>/<file_id>'`` for the record ``jso``.

    The data source comes from ``get_data_source``; missing values are
    rendered as the string ``'None'`` by the f-string formatting.
    """
    return f"{get_data_source(jso)}/{jso.get('file_id')}"
def spark_json_extractor(jso: dict) -> dict:
    """Extract the PDF type and the model output from a record.

    Returns:
        dict: ``{'_pdf_type': jso['_pdf_type'], 'model_list': jso['doc_layout_result']}``.

    Raises:
        KeyError: If either source key is missing from ``jso``.
    """
    pdf_type = jso['_pdf_type']
    model_list = jso['doc_layout_result']
    return {'_pdf_type': pdf_type, 'model_list': model_list}
import os
import shutil
import tempfile
from pathlib import Path
import click
import fitz
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.data.batch_build_dataset import batch_build_dataset
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
# File suffixes (lower-case, including the leading dot) accepted by the CLI,
# grouped by how the input is handled: used as-is, converted through
# convert_file_to_pdf, or converted through fitz.
pdf_suffixes = ['.pdf']
ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
image_suffixes = ['.png', '.jpeg', '.jpg']
@click.command()
@click.version_option(__version__,
                      '--version',
                      '-v',
                      help='display the version and exit')
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='output local directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
    default='auto',
)
@click.option(
    '-l',
    '--lang',
    'lang',
    type=str,
    help="""
Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
You should input "Abbreviation" with language form url:
https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
""",
    default=None,
)
@click.option(
    '-d',
    '--debug',
    'debug_able',
    type=bool,
    help='Enables detailed debugging information during the execution of the CLI commands.',
    default=False,
)
@click.option(
    '-s',
    '--start',
    'start_page_id',
    type=int,
    help='The starting page for PDF parsing, beginning from 0.',
    default=0,
)
@click.option(
    '-e',
    '--end',
    'end_page_id',
    type=int,
    help='The ending page for PDF parsing, beginning from 0.',
    default=None,
)
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
    # Entry point of the command: parse a single document, or every supported
    # document directly inside a directory, and write the results to output_dir.
    #
    # This is documented with comments rather than a docstring on purpose:
    # click shows a command's docstring as its --help text, so adding one
    # would change the CLI output.
    os.makedirs(output_dir, exist_ok=True)
    # Scratch directory for PDFs produced from office documents and images.
    temp_dir = tempfile.mkdtemp()

    # try/finally guarantees the scratch directory is removed even when a
    # conversion or the batch parse raises.
    try:

        def ensure_pdf(src: Path) -> Path:
            # Return the path of a PDF for `src`, converting it into temp_dir when needed.
            if src.suffix in ms_office_suffixes:
                convert_file_to_pdf(str(src), temp_dir)
                return Path(os.path.join(temp_dir, f'{src.stem}.pdf'))
            if src.suffix in image_suffixes:
                with open(str(src), 'rb') as f:
                    bits = f.read()
                pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
                fn = os.path.join(temp_dir, f'{src.stem}.pdf')
                with open(fn, 'wb') as f:
                    f.write(pdf_bytes)
                return Path(fn)
            if src.suffix in pdf_suffixes:
                return src
            raise Exception(f'Unknown file suffix: {src.suffix}')

        def read_fn(path: Path):
            # Read the document at `path` as PDF bytes.
            fn = str(ensure_pdf(path))
            disk_rw = FileBasedDataReader(os.path.dirname(fn))
            return disk_rw.read(os.path.basename(fn))

        def parse_doc(doc_path: Path, dataset: Dataset | None = None):
            # Parse one document; failures are logged instead of propagated.
            try:
                file_name = str(Path(doc_path).stem)
                if dataset is None:
                    pdf_data_or_dataset = read_fn(doc_path)
                else:
                    pdf_data_or_dataset = dataset
                do_parse(
                    output_dir,
                    file_name,
                    pdf_data_or_dataset,
                    [],
                    method,
                    debug_able,
                    start_page_id=start_page_id,
                    end_page_id=end_page_id,
                    lang=lang
                )
            except Exception as e:
                logger.exception(e)

        if os.path.isdir(path):
            supported_suffixes = pdf_suffixes + image_suffixes + ms_office_suffixes
            doc_paths = [
                ensure_pdf(doc_path)
                for doc_path in Path(path).glob('*')
                if doc_path.suffix in supported_suffixes
            ]
            datasets = batch_build_dataset(doc_paths, 4, lang)
            batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)
        else:
            parse_doc(Path(path))
    finally:
        shutil.rmtree(temp_dir)
# Run the click command when this module is executed as a script.
if __name__ == '__main__':
    cli()
import json as json_parse
import os
from pathlib import Path
import click
import magic_pdf.model as model_config
from magic_pdf.data.data_reader_writer import FileBasedDataReader, S3DataReader
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
remove_non_official_s3_args)
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import do_parse, parse_pdf_methods
def read_s3_path(s3path):
    """Read an object, or a byte range of it, from S3.

    The bucket is taken from ``s3path`` and its credentials from
    ``get_s3_config``.  When ``parse_s3_range_params`` yields exactly two
    values they are used as the start and end byte offsets; otherwise the
    read is issued with start 0 and end -1.

    Args:
        s3path: S3 path, optionally carrying range parameters.

    Returns:
        Whatever ``S3DataReader.read_at`` returns for the requested range.
    """
    bucket, _key = parse_s3path(s3path)
    s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
    reader = S3DataReader('', bucket, s3_ak, s3_sk, s3_endpoint, 'auto')

    range_params = parse_s3_range_params(s3path)
    if range_params is not None and len(range_params) == 2:
        byte_start = int(range_params[0])
        byte_end = int(range_params[1])
    else:
        byte_start, byte_end = 0, -1

    clean_path = remove_non_official_s3_args(s3path)
    return reader.read_at(clean_path, byte_start, byte_end)
# Root click group that the sub-commands of this module are attached to.
# It is left without a docstring because click would display one as the
# group's --help text.
@click.group()
@click.version_option(__version__, '--version', '-v', help='显示版本信息')
def cli():
    pass
@cli.command()
@click.option(
    '-j',
    '--jsonl',
    'jsonl',
    type=str,
    help='输入 jsonl 路径,本地或者 s3 上的文件',
    required=True,
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
    default='auto',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='输出到本地目录',
)
def jsonl(jsonl, method, output_dir):
    # Parse the PDF referenced by a jsonl record, using the model output stored
    # in that record ('doc_layout_result') instead of running the built-in model.
    # (Comments instead of a docstring: click would show a docstring as help text.)
    model_config.__use_inside_model__ = False

    # An s3:// input is read whole; for a local file only the first line is used.
    if jsonl.startswith('s3://'):
        record_text = read_s3_path(jsonl).decode('utf-8')
    else:
        with open(jsonl) as f:
            record_text = f.readline()
    jso = json_parse.loads(record_text)

    os.makedirs(output_dir, exist_ok=True)

    # The PDF location is stored under 'file_location', or 'path' as a fallback.
    s3_file_path = jso.get('file_location')
    if s3_file_path is None:
        s3_file_path = jso.get('path')

    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)
    print(pdf_file_name, jso, method)

    do_parse(
        output_dir,
        pdf_file_name,
        pdf_data,
        jso['doc_layout_result'],
        method,
        False,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )
@cli.command()
@click.option(
    '-p',
    '--pdf',
    'pdf',
    type=click.Path(exists=True),
    required=True,
    help='本地 PDF 文件',
)
@click.option(
    '-j',
    '--json',
    'json_data',
    type=click.Path(exists=True),
    required=True,
    help='本地模型推理出的 json 数据',
)
@click.option('-o',
              '--output-dir',
              'output_dir',
              type=click.Path(),
              required=True,
              help='本地输出目录')
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
    default='auto',
)
def pdf(pdf, json_data, output_dir, method):
    # Parse a local PDF with a local model-output JSON file, without running
    # the built-in model.
    # (Comments instead of a docstring: click would show a docstring as help text.)
    model_config.__use_inside_model__ = False

    full_pdf_path = os.path.realpath(pdf)
    os.makedirs(output_dir, exist_ok=True)

    def load_bytes(location):
        # Read a local file through FileBasedDataReader rooted at its directory.
        directory, base_name = os.path.dirname(location), os.path.basename(location)
        return FileBasedDataReader(directory).read(base_name)

    model_json_list = json_parse.loads(load_bytes(json_data).decode('utf-8'))
    file_name = str(Path(full_pdf_path).stem)
    pdf_data = load_bytes(full_pdf_path)

    do_parse(
        output_dir,
        file_name,
        pdf_data,
        model_json_list,
        method,
        False,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )
# Run the click group when this module is executed as a script.
if __name__ == '__main__':
    cli()
import os
import click
import fitz
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import Dataset, PymuDocDataset
from magic_pdf.libs.draw_bbox import draw_char_bbox
from magic_pdf.model.doc_analyze_by_custom_model import (batch_doc_analyze,
doc_analyze)
# from io import BytesIO
# from pypdf import PdfReader, PdfWriter
def prepare_env(output_dir, pdf_file_name, method):
    """Create the output directories for one document and return their paths.

    The markdown directory is ``<output_dir>/<pdf_file_name>/<method>`` and
    the image directory is its ``images`` sub-directory.  Both are created
    when they do not exist yet.

    Returns:
        tuple: ``(local_image_dir, local_md_dir)``.
    """
    local_md_dir = os.path.join(output_dir, pdf_file_name, method)
    local_image_dir = os.path.join(str(local_md_dir), 'images')
    for directory in (local_image_dir, local_md_dir):
        os.makedirs(directory, exist_ok=True)
    return local_image_dir, local_md_dir
# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
# # 将字节数据包装在 BytesIO 对象中
# pdf_file = BytesIO(pdf_bytes)
# # 读取 PDF 的字节数据
# reader = PdfReader(pdf_file)
# # 创建一个新的 PDF 写入器
# writer = PdfWriter()
# # 将所有页面添加到新的 PDF 写入器中
# end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
# if end_page_id > len(reader.pages) - 1:
# logger.warning("end_page_id is out of range, use pdf_docs length")
# end_page_id = len(reader.pages) - 1
# for i, page in enumerate(reader.pages):
# if start_page_id <= i <= end_page_id:
# writer.add_page(page)
# # 创建一个字节缓冲区来存储输出的 PDF 数据
# output_buffer = BytesIO()
# # 将 PDF 写入字节缓冲区
# writer.write(output_buffer)
# # 获取字节缓冲区的内容
# converted_pdf_bytes = output_buffer.getvalue()
# return converted_pdf_bytes
def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
    """Return a new PDF, as bytes, containing only the requested page range.

    Args:
        pdf_bytes: The source PDF as bytes.
        start_page_id: Index of the first page to keep (0-based).
        end_page_id: Index of the last page to keep (0-based, inclusive).
            ``None`` or a negative value selects the last page of the
            document; a value beyond the last page is clamped to it and a
            warning is logged.

    Returns:
        bytes: The serialized PDF holding pages ``start_page_id`` to
        ``end_page_id``.
    """
    document = fitz.open('pdf', pdf_bytes)
    try:
        output_document = fitz.open()
        try:
            last_page_id = len(document) - 1
            if end_page_id is None or end_page_id < 0:
                end_page_id = last_page_id
            elif end_page_id > last_page_id:
                logger.warning('end_page_id is out of range, use pdf_docs length')
                end_page_id = last_page_id

            output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
            return output_document.tobytes()
        finally:
            # Release both documents explicitly instead of waiting for garbage collection.
            output_document.close()
    finally:
        document.close()
def _do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes_or_dataset,
    model_list,
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Parse one document and write the selected artifacts to disk.

    Args:
        output_dir: Root output directory; results are written to
            ``<output_dir>/<pdf_file_name>/<parse_method>``.
        pdf_file_name: Base name used for the directory and every output file.
        pdf_bytes_or_dataset: Raw PDF bytes (cut to ``start_page_id`` ..
            ``end_page_id`` first) or an already built dataset, which is
            used as-is.
        model_list: Pre-computed model output.  When empty, inference is run
            with the built-in model, which requires
            ``model_config.__use_inside_model__`` to be truthy.
        parse_method: ``'auto'``, ``'txt'`` or ``'ocr'``.  With ``'auto'``
            (and, when ``model_list`` is given, with any other value) the
            mode is chosen from ``ds.classify()``.
        debug_able: When true, also draws the model and line-sort boxes.
        f_draw_span_bbox, f_draw_layout_bbox, f_draw_model_bbox,
        f_draw_line_sort_bbox, f_draw_char_bbox: Select which debug PDFs are
            drawn.
        f_dump_md, f_dump_middle_json, f_dump_model_json, f_dump_orig_pdf,
        f_dump_content_list: Select which result files are written.
        f_make_md_mode: Markdown make mode passed to ``dump_md``.
        start_page_id, end_page_id: Page range applied to bytes input only.
        lang: Language passed to the dataset built from bytes input.
        layout_model, formula_enable, table_enable: Passed to ``doc_analyze``
            when inference is run here.

    Raises:
        SystemExit: With code 1 for an unknown ``parse_method`` and code 2
            when ``model_list`` is empty but the built-in model is disabled
            (both only on the inference path).
    """
    # sys.exit is used instead of the site-provided exit() builtin, which is
    # not guaranteed to exist (for example under `python -S` or in frozen builds).
    import sys

    from magic_pdf.operators.models import InferenceResult

    if debug_able:
        logger.warning('debug mode is on')
        f_draw_model_bbox = True
        f_draw_line_sort_bbox = True
        # f_draw_char_bbox = True

    if isinstance(pdf_bytes_or_dataset, bytes):
        pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
            pdf_bytes_or_dataset, start_page_id, end_page_id
        )
        ds = PymuDocDataset(pdf_bytes, lang=lang)
    else:
        ds = pdf_bytes_or_dataset
        pdf_bytes = ds._raw_data

    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
    image_dir = str(os.path.basename(local_image_dir))

    if len(model_list) == 0:
        # No model output supplied: run inference with the built-in model.
        if not model_config.__use_inside_model__:
            logger.error('need model list input')
            sys.exit(2)

        if parse_method == 'auto':
            is_text_pdf = ds.classify() == SupportedPdfParseMethod.TXT
            use_ocr = not is_text_pdf
        elif parse_method == 'txt':
            use_ocr = False
        elif parse_method == 'ocr':
            use_ocr = True
        else:
            logger.error('unknown parse method')
            sys.exit(1)

        infer_result = ds.apply(
            doc_analyze,
            ocr=use_ocr,
            lang=ds._lang,
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
        )
    else:
        infer_result = InferenceResult(model_list, ds)
        if parse_method == 'ocr':
            use_ocr = True
        elif parse_method == 'txt':
            use_ocr = False
        else:
            # Any other value (including 'auto') falls back to classification.
            is_text_pdf = ds.classify() == SupportedPdfParseMethod.TXT
            use_ocr = not is_text_pdf

    if use_ocr:
        pipe_result = infer_result.pipe_ocr_mode(
            image_writer, debug_mode=True, lang=ds._lang
        )
    else:
        pipe_result = infer_result.pipe_txt_mode(
            image_writer, debug_mode=True, lang=ds._lang
        )

    if f_draw_model_bbox:
        infer_result.draw_model(
            os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf')
        )

    if f_draw_layout_bbox:
        pipe_result.draw_layout(
            os.path.join(local_md_dir, f'{pdf_file_name}_layout.pdf')
        )

    if f_draw_span_bbox:
        pipe_result.draw_span(os.path.join(local_md_dir, f'{pdf_file_name}_spans.pdf'))

    if f_draw_line_sort_bbox:
        pipe_result.draw_line_sort(
            os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
        )

    if f_draw_char_bbox:
        draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf')

    if f_dump_md:
        pipe_result.dump_md(
            md_writer,
            f'{pdf_file_name}.md',
            image_dir,
            drop_mode=DropMode.NONE,
            md_make_mode=f_make_md_mode,
        )

    if f_dump_middle_json:
        pipe_result.dump_middle_json(md_writer, f'{pdf_file_name}_middle.json')

    if f_dump_model_json:
        infer_result.dump_model(md_writer, f'{pdf_file_name}_model.json')

    if f_dump_orig_pdf:
        md_writer.write(
            f'{pdf_file_name}_origin.pdf',
            pdf_bytes,
        )

    if f_dump_content_list:
        pipe_result.dump_content_list(
            md_writer,
            f'{pdf_file_name}_content_list.json',
            image_dir
        )

    logger.info(f'local output dir is {local_md_dir}')
def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes_or_dataset,
    model_list,
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Parse one document, through the batch path when parallel inference is enabled.

    When the environment variable ``MINERU_PARALLEL_INFERENCE_COUNT`` is set
    to an integer greater than 1, the document is handed to
    ``batch_do_parse``; otherwise ``_do_parse`` is called directly.  All
    ``f_*`` switches, ``lang`` and the model options are forwarded on both
    paths.

    Args:
        output_dir: Root output directory.
        pdf_file_name: Base name used for the outputs.
        pdf_bytes_or_dataset: Raw PDF bytes or an already built dataset.
        model_list: Pre-computed model output; only used on the direct path.
        parse_method: ``'auto'``, ``'txt'`` or ``'ocr'``.
        debug_able: Enables the extra debug drawings.
        start_page_id, end_page_id: Page range, applied to bytes input.
        lang: Language passed on to dataset construction and inference.
        layout_model, formula_enable, table_enable: Model options.

    Raises:
        ValueError: If ``MINERU_PARALLEL_INFERENCE_COUNT`` is set to a
            non-integer value.
    """
    parallel_count = 1
    configured_count = os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT')
    if configured_count:
        parallel_count = int(configured_count)

    # Output switches shared by both code paths.
    output_options = dict(
        f_draw_span_bbox=f_draw_span_bbox,
        f_draw_layout_bbox=f_draw_layout_bbox,
        f_dump_md=f_dump_md,
        f_dump_middle_json=f_dump_middle_json,
        f_dump_model_json=f_dump_model_json,
        f_dump_orig_pdf=f_dump_orig_pdf,
        f_dump_content_list=f_dump_content_list,
        f_make_md_mode=f_make_md_mode,
        f_draw_model_bbox=f_draw_model_bbox,
        f_draw_line_sort_bbox=f_draw_line_sort_bbox,
        f_draw_char_bbox=f_draw_char_bbox,
    )

    if parallel_count > 1:
        if isinstance(pdf_bytes_or_dataset, bytes):
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
                pdf_bytes_or_dataset, start_page_id, end_page_id
            )
            ds = PymuDocDataset(pdf_bytes, lang=lang)
        else:
            ds = pdf_bytes_or_dataset
        # NOTE(review): model_list is not used on this path; the batch path
        # always runs its own inference — confirm this is intended.
        batch_do_parse(
            output_dir,
            [pdf_file_name],
            [ds],
            parse_method,
            debug_able,
            lang=lang,
            # Forward the model options so both paths honour the caller's
            # choices; previously they were dropped here.
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
            **output_options,
        )
    else:
        _do_parse(
            output_dir,
            pdf_file_name,
            pdf_bytes_or_dataset,
            model_list,
            parse_method,
            debug_able,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
            lang=lang,
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
            **output_options,
        )
def batch_do_parse(
    output_dir,
    pdf_file_names: list[str],
    pdf_bytes_or_datasets: list[bytes | Dataset],
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Run inference over several documents at once, then write each one's outputs.

    Args:
        output_dir: Root output directory.
        pdf_file_names: Output base name for every document, aligned with
            ``pdf_bytes_or_datasets``.
        pdf_bytes_or_datasets: Raw PDF bytes (wrapped in a ``PymuDocDataset``)
            or already built datasets.
        parse_method: ``'auto'``, ``'txt'`` or ``'ocr'``.
        debug_able: Enables the extra debug drawings.
        f_make_md_mode: Markdown make mode used when dumping markdown.
        lang: Language passed on to dataset construction and inference.
        layout_model, formula_enable, table_enable: Model options passed to
            ``batch_doc_analyze``.
        The remaining ``f_*`` switches select which artifacts ``_do_parse``
        writes.
    """
    dss = [
        PymuDocDataset(item, lang=lang) if isinstance(item, bytes) else item
        for item in pdf_bytes_or_datasets
    ]

    infer_results = batch_doc_analyze(
        dss,
        parse_method,
        lang=lang,
        layout_model=layout_model,
        formula_enable=formula_enable,
        table_enable=table_enable,
    )

    for idx, infer_result in enumerate(infer_results):
        _do_parse(
            output_dir=output_dir,
            pdf_file_name=pdf_file_names[idx],
            pdf_bytes_or_dataset=dss[idx],
            model_list=infer_result.get_infer_res(),
            parse_method=parse_method,
            debug_able=debug_able,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            # Honour the caller's make mode; it used to be hard-coded to MakeMode.MM_MD.
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
            lang=lang,
            # Only consulted by _do_parse if the inference result is empty and it
            # has to run inference itself; forwarded to keep both runs consistent.
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
        )
# click parameter type restricting a parse-method option to 'ocr', 'txt' or 'auto'.
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
from loguru import logger
def ImportPIL(f):
    """Decorator that makes sure Pillow is importable before ``f`` is used.

    The check runs once, when the decorator is applied.  If ``PIL`` cannot be
    imported an error is logged and the process exits with status 1;
    otherwise ``f`` is returned unchanged.
    """
    pillow_available = True
    try:
        import PIL  # noqa: F401
    except ImportError:
        pillow_available = False

    if not pillow_available:
        logger.error('Pillow not installed, please install by pip.')
        exit(1)
    return f
import os
import subprocess
import platform
from pathlib import Path
import shutil
from loguru import logger
class ConvertToPdfError(Exception):
    """Error raised when a document cannot be converted to PDF.

    The message is passed to ``Exception`` and also kept on ``msg``.
    """

    def __init__(self, msg):
        super().__init__(msg)
        self.msg = msg
def check_fonts_installed():
    """Check if required Chinese fonts are installed.

    On Windows and macOS nothing is checked.  Elsewhere ``fc-list :lang=zh``
    is run: any non-blank output means fonts were found; an empty result is
    reported with a warning.  Any failure while running the command is
    ignored.

    Returns:
        True when ``fc-list`` reported at least one Chinese font, otherwise
        None.
    """
    if platform.system() in ('Windows', 'Darwin'):
        return None

    # Linux: use fc-list
    try:
        listing = subprocess.check_output(['fc-list', ':lang=zh'], encoding='utf-8')
        if listing.strip():  # any non-blank output counts as "fonts present"
            return True
        logger.warning(
            "No Chinese fonts were detected, the converted document may not display Chinese content properly."
        )
    except Exception:
        # Best effort only: a missing or failing fc-list must not block the conversion.
        pass
    return None
def get_soffice_command():
    """Return the path to LibreOffice's soffice executable depending on the platform.

    ``soffice`` on ``PATH`` wins.  Otherwise a fixed list of common
    installation locations is probed (Windows program folders and drive
    roots, or the usual Linux/macOS paths).

    Returns:
        str: Path of the first executable found.

    Raises:
        ConvertToPdfError: If no soffice executable can be located; the
            message contains installation instructions.
    """
    # First check if soffice is in PATH
    soffice_path = shutil.which('soffice')
    if soffice_path:
        return soffice_path

    if platform.system() == 'Windows':
        # Check common installation paths
        candidates = [
            Path(os.environ.get('PROGRAMFILES', 'C:/Program Files')) / 'LibreOffice/program/soffice.exe',
            Path(os.environ.get('PROGRAMFILES(X86)', 'C:/Program Files (x86)')) / 'LibreOffice/program/soffice.exe',
            Path('C:/Program Files/LibreOffice/program/soffice.exe'),
            Path('C:/Program Files (x86)/LibreOffice/program/soffice.exe'),
        ]
        # Check other drives for windows
        candidates.extend(
            Path(f"{drive}/LibreOffice/program/soffice.exe")
            for drive in ['C:', 'D:', 'E:', 'F:', 'G:', 'H:']
        )
        for candidate in candidates:
            if candidate.exists():
                return str(candidate)
        raise ConvertToPdfError(
            "LibreOffice not found. Please install LibreOffice from https://www.libreoffice.org/ "
            "or ensure soffice.exe is in your PATH environment variable."
        )

    # Linux/macOS: try the standard locations.
    for candidate in (
        '/usr/bin/soffice',
        '/usr/local/bin/soffice',
        '/opt/libreoffice/program/soffice',
        '/Applications/LibreOffice.app/Contents/MacOS/soffice',
    ):
        if os.path.exists(candidate):
            return candidate

    # Raised directly, as on Windows.  The former try/except around this
    # branch only ever caught this very error and re-wrapped it, prefixing
    # the message with a redundant "Error locating LibreOffice: ".
    raise ConvertToPdfError(
        "LibreOffice not found. Please install it:\n"
        "  - Ubuntu/Debian: sudo apt-get install libreoffice\n"
        "  - CentOS/RHEL: sudo yum install libreoffice\n"
        "  - macOS: brew install libreoffice or download from https://www.libreoffice.org/\n"
        "  - Or ensure soffice is in your PATH environment variable."
    )
def convert_file_to_pdf(input_path, output_dir):
    """Convert a single document (ppt, doc, etc.) to PDF.

    The conversion is done by running LibreOffice headless; the PDF is
    written into ``output_dir``, which is created if necessary.

    Args:
        input_path: Path of the document to convert.
        output_dir: Directory that receives the converted PDF.

    Raises:
        FileNotFoundError: If ``input_path`` is not an existing file.
        ConvertToPdfError: If LibreOffice cannot be located or exits with a
            non-zero status.
    """
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The input file {input_path} does not exist.")

    os.makedirs(output_dir, exist_ok=True)

    check_fonts_installed()

    soffice_cmd = get_soffice_command()

    cmd = [
        soffice_cmd,
        '--headless',
        '--norestore',
        '--invisible',
        '--convert-to', 'pdf',
        '--outdir', str(output_dir),
        str(input_path)
    ]

    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if process.returncode != 0:
        # stderr may not be valid UTF-8 (for example localized console output);
        # decode leniently so the real failure is not masked by a UnicodeDecodeError.
        error_output = process.stderr.decode(errors='replace')
        raise ConvertToPdfError(f"LibreOffice convert failed: {error_output}")
......@@ -3,23 +3,6 @@
"bucket-name-1":["ak", "sk", "endpoint"],
"bucket-name-2":["ak", "sk", "endpoint"]
},
"models-dir":"/tmp/models",
"layoutreader-model-dir":"/tmp/layoutreader",
"device-mode":"cpu",
"layout-config": {
"model": "doclayout_yolo"
},
"formula-config": {
"mfd_model": "yolo_v8_mfd",
"mfr_model": "unimernet_small",
"enable": true
},
"table-config": {
"model": "rapid_table",
"sub_model": "slanet_plus",
"enable": true,
"max_time": 400
},
"latex-delimiter-config": {
"display": {
"left": "$$",
......@@ -31,18 +14,6 @@
}
},
"llm-aided-config": {
"formula_aided": {
"api_key": "your_api_key",
"base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
"model": "qwen2.5-7b-instruct",
"enable": false
},
"text_aided": {
"api_key": "your_api_key",
"base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
"model": "qwen2.5-7b-instruct",
"enable": false
},
"title_aided": {
"api_key": "your_api_key",
"base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
......@@ -50,5 +21,9 @@
"enable": false
}
},
"config_version": "1.2.1"
"models-dir": {
"pipeline": "",
"vlm": ""
},
"config_version": "1.3.0"
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment