Unverified commit b4f7b53e, authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #1261 from opendatalab/release-0.10.6

Release 0.10.6
parents a962824b d3b51aa5
@@ -4,8 +4,8 @@ import statistics
 import time
 from typing import List

-import torch
 import fitz
+import torch
 from loguru import logger

 from magic_pdf.config.enums import SupportedPdfParseMethod
@@ -16,17 +16,13 @@ from magic_pdf.libs.clean_memory import clean_memory
 from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
 from magic_pdf.model.magic_model import MagicModel

-os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # stop albumentations from checking for updates
-os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger

 try:
     import torchtext
-    if torchtext.__version__ >= "0.18.0":
+    if torchtext.__version__ >= '0.18.0':
         torchtext.disable_torchtext_deprecation_warning()
 except ImportError:
     pass
@@ -39,6 +35,9 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
 from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
 from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans

+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # stop albumentations from checking for updates
+os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger


 def __replace_STX_ETX(text_str: str):
     """Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
@@ -233,7 +232,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
     # initialize the OCR model
     atom_model_manager = AtomModelSingleton()
     ocr_model = atom_model_manager.get_atom_model(
-        atom_model_name="ocr",
+        atom_model_name='ocr',
         ocr_show_log=False,
         det_db_box_thresh=0.3,
         lang=lang
@@ -241,7 +240,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
     for span in empty_spans:
         # crop the span's bbox and run OCR on the crop
-        span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
+        span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
         ocr_res = ocr_model.ocr(span_img, det=False)
         if ocr_res and len(ocr_res) > 0:
             if len(ocr_res[0]) > 0:
@@ -681,7 +680,7 @@ def parse_page_core(
     """Build spans according to parse_mode; mainly fills characters into text-type spans."""
     if parse_mode == SupportedPdfParseMethod.TXT:
-        """use the new hybrid OCR scheme"""
+        """Use the new hybrid OCR scheme."""
         spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
     elif parse_mode == SupportedPdfParseMethod.OCR:
@@ -689,7 +688,6 @@ def parse_page_core(
     else:
         raise Exception('parse_mode must be txt or ocr')

     """First handle the discarded_blocks, which need no layout ordering."""
     discarded_block_with_spans, spans = fill_spans_in_blocks(
         all_discarded_blocks, spans, 0.4
@@ -762,8 +760,8 @@ def parse_page_core(
 def pdf_parse_union(
-    dataset: Dataset,
     model_list,
+    dataset: Dataset,
     imageWriter,
     parse_mode,
     start_page_id=0,
@@ -771,6 +769,7 @@ def pdf_parse_union(
     debug_mode=False,
     lang=None,
 ):
     pdf_bytes_md5 = compute_md5(dataset.data_bits())
     """Initialize an empty pdf_info_dict."""
...
@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
 from magic_pdf.config.drop_reason import DropReason
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
 from magic_pdf.dict2md.ocr_mkcontent import union_make
 from magic_pdf.filter.pdf_classify_by_type import classify
 from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
@@ -14,9 +15,9 @@ class AbsPipe(ABC):
     PIP_OCR = 'ocr'
     PIP_TXT = 'txt'

-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
+    def __init__(self, dataset: Dataset, model_list: list, image_writer: DataWriter, is_debug: bool = False,
                  start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
-        self.pdf_bytes = pdf_bytes
+        self.dataset = dataset
         self.model_list = model_list
         self.image_writer = image_writer
         self.pdf_mid_data = None  # not yet compressed
...
@@ -2,40 +2,79 @@ from loguru import logger
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.pipe.AbsPipe import AbsPipe
 from magic_pdf.user_api import parse_ocr_pdf


 class OCRPipe(AbsPipe):
-
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None, lang=None,
-                 layout_model=None, formula_enable=None, table_enable=None):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
-                         layout_model, formula_enable, table_enable)
+    def __init__(
+        self,
+        dataset: Dataset,
+        model_list: list,
+        image_writer: DataWriter,
+        is_debug: bool = False,
+        start_page_id=0,
+        end_page_id=None,
+        lang=None,
+        layout_model=None,
+        formula_enable=None,
+        table_enable=None,
+    ):
+        super().__init__(
+            dataset,
+            model_list,
+            image_writer,
+            is_debug,
+            start_page_id,
+            end_page_id,
+            lang,
+            layout_model,
+            formula_enable,
+            table_enable,
+        )

     def pipe_classify(self):
         pass

     def pipe_analyze(self):
-        self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
-                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id,
-                                      lang=self.lang, layout_model=self.layout_model,
-                                      formula_enable=self.formula_enable, table_enable=self.table_enable)
+        self.infer_res = doc_analyze(
+            self.dataset,
+            ocr=True,
+            start_page_id=self.start_page_id,
+            end_page_id=self.end_page_id,
+            lang=self.lang,
+            layout_model=self.layout_model,
+            formula_enable=self.formula_enable,
+            table_enable=self.table_enable,
+        )

     def pipe_parse(self):
-        self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
-                                          lang=self.lang, layout_model=self.layout_model,
-                                          formula_enable=self.formula_enable, table_enable=self.table_enable)
+        self.pdf_mid_data = parse_ocr_pdf(
+            self.dataset,
+            self.infer_res,
+            self.image_writer,
+            is_debug=self.is_debug,
+            start_page_id=self.start_page_id,
+            end_page_id=self.end_page_id,
+            lang=self.lang,
+            layout_model=self.layout_model,
+            formula_enable=self.formula_enable,
+            table_enable=self.table_enable,
+        )

     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
         logger.info('ocr_pipe mk content list finished')
         return result

-    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
+    def pipe_mk_markdown(
+        self,
+        img_parent_path: str,
+        drop_mode=DropMode.WHOLE_PDF,
+        md_make_mode=MakeMode.MM_MD,
+    ):
         result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
         logger.info(f'ocr_pipe mk {md_make_mode} finished')
         return result
@@ -2,6 +2,7 @@ from loguru import logger
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.pipe.AbsPipe import AbsPipe
 from magic_pdf.user_api import parse_txt_pdf
@@ -9,23 +10,23 @@ from magic_pdf.user_api import parse_txt_pdf
 class TXTPipe(AbsPipe):

-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
+    def __init__(self, dataset: Dataset, model_list: list, image_writer: DataWriter, is_debug: bool = False,
                  start_page_id=0, end_page_id=None, lang=None,
                  layout_model=None, formula_enable=None, table_enable=None):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
+        super().__init__(dataset, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
                          layout_model, formula_enable, table_enable)

     def pipe_classify(self):
         pass

     def pipe_analyze(self):
-        self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
+        self.model_list = doc_analyze(self.dataset, ocr=False,
                                       start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                       lang=self.lang, layout_model=self.layout_model,
                                       formula_enable=self.formula_enable, table_enable=self.table_enable)

     def pipe_parse(self):
-        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
+        self.pdf_mid_data = parse_txt_pdf(self.dataset, self.model_list, self.image_writer, is_debug=self.is_debug,
                                           start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                           lang=self.lang, layout_model=self.layout_model,
                                           formula_enable=self.formula_enable, table_enable=self.table_enable)
...
@@ -4,6 +4,7 @@ from loguru import logger
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
 from magic_pdf.libs.commons import join_path
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.pipe.AbsPipe import AbsPipe
@@ -12,12 +13,32 @@ from magic_pdf.user_api import parse_ocr_pdf, parse_union_pdf
 class UNIPipe(AbsPipe):

-    def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: DataWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None, lang=None,
-                 layout_model=None, formula_enable=None, table_enable=None):
+    def __init__(
+        self,
+        dataset: Dataset,
+        jso_useful_key: dict,
+        image_writer: DataWriter,
+        is_debug: bool = False,
+        start_page_id=0,
+        end_page_id=None,
+        lang=None,
+        layout_model=None,
+        formula_enable=None,
+        table_enable=None,
+    ):
         self.pdf_type = jso_useful_key['_pdf_type']
-        super().__init__(pdf_bytes, jso_useful_key['model_list'], image_writer, is_debug, start_page_id, end_page_id,
-                         lang, layout_model, formula_enable, table_enable)
+        super().__init__(
+            dataset,
+            jso_useful_key['model_list'],
+            image_writer,
+            is_debug,
+            start_page_id,
+            end_page_id,
+            lang,
+            layout_model,
+            formula_enable,
+            table_enable,
+        )
         if len(self.model_list) == 0:
             self.input_model_is_empty = True
         else:
@@ -28,35 +49,66 @@ class UNIPipe(AbsPipe):
     def pipe_analyze(self):
         if self.pdf_type == self.PIP_TXT:
-            self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
-                                          lang=self.lang, layout_model=self.layout_model,
-                                          formula_enable=self.formula_enable, table_enable=self.table_enable)
+            self.model_list = doc_analyze(
+                self.dataset,
+                ocr=False,
+                start_page_id=self.start_page_id,
+                end_page_id=self.end_page_id,
+                lang=self.lang,
+                layout_model=self.layout_model,
+                formula_enable=self.formula_enable,
+                table_enable=self.table_enable,
+            )
         elif self.pdf_type == self.PIP_OCR:
-            self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
-                                          lang=self.lang, layout_model=self.layout_model,
-                                          formula_enable=self.formula_enable, table_enable=self.table_enable)
+            self.model_list = doc_analyze(
+                self.dataset,
+                ocr=True,
+                start_page_id=self.start_page_id,
+                end_page_id=self.end_page_id,
+                lang=self.lang,
+                layout_model=self.layout_model,
+                formula_enable=self.formula_enable,
+                table_enable=self.table_enable,
+            )

     def pipe_parse(self):
         if self.pdf_type == self.PIP_TXT:
-            self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
-                                                is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
-                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id,
-                                                lang=self.lang, layout_model=self.layout_model,
-                                                formula_enable=self.formula_enable, table_enable=self.table_enable)
+            self.pdf_mid_data = parse_union_pdf(
+                self.dataset,
+                self.model_list,
+                self.image_writer,
+                is_debug=self.is_debug,
+                start_page_id=self.start_page_id,
+                end_page_id=self.end_page_id,
+                lang=self.lang,
+                layout_model=self.layout_model,
+                formula_enable=self.formula_enable,
+                table_enable=self.table_enable,
+            )
         elif self.pdf_type == self.PIP_OCR:
-            self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
-                                              is_debug=self.is_debug,
-                                              start_page_id=self.start_page_id, end_page_id=self.end_page_id,
-                                              lang=self.lang)
+            self.pdf_mid_data = parse_ocr_pdf(
+                self.dataset,
+                self.model_list,
+                self.image_writer,
+                is_debug=self.is_debug,
+                start_page_id=self.start_page_id,
+                end_page_id=self.end_page_id,
+                lang=self.lang,
+            )

-    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON):
+    def pipe_mk_uni_format(
+        self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON
+    ):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
         logger.info('uni_pipe mk content list finished')
         return result

-    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
+    def pipe_mk_markdown(
+        self,
+        img_parent_path: str,
+        drop_mode=DropMode.WHOLE_PDF,
+        md_make_mode=MakeMode.MM_MD,
+    ):
         result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
         logger.info(f'uni_pipe mk {md_make_mode} finished')
         return result
@@ -65,6 +117,7 @@ class UNIPipe(AbsPipe):
 if __name__ == '__main__':
     # test
     from magic_pdf.data.data_reader_writer import DataReader

     drw = DataReader(r'D:/project/20231108code-clean')
     pdf_file_path = r'linshixuqiu\19983-00.pdf'
@@ -82,10 +135,7 @@ if __name__ == '__main__':
     #     "model_list": model_list
     # }
-    jso_useful_key = {
-        '_pdf_type': '',
-        'model_list': model_list
-    }
+    jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
     pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
     pipe.pipe_classify()
     pipe.pipe_parse()
@@ -94,5 +144,7 @@ if __name__ == '__main__':
     md_writer = DataWriter(write_path)
     md_writer.write_string('19983-00.md', md_content)
-    md_writer.write_string('19983-00.json', json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4))
+    md_writer.write_string(
+        '19983-00.json', json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
+    )
     md_writer.write_string('19983-00.txt', str(content_list))
import copy
import json
import os
from typing import Callable

from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.dict2md.ocr_mkcontent import union_make
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
                                      draw_span_bbox)
from magic_pdf.libs.json_compressor import JsonCompressor


class PipeResult:
    def __init__(self, pipe_res, dataset: Dataset):
        """Initialize.

        Args:
            pipe_res (list[dict]): the pipeline-processed result of the model inference
            dataset (Dataset): the dataset associated with pipe_res
        """
        self._pipe_res = pipe_res
        self._dataset = dataset

    def dump_md(
        self,
        writer: DataWriter,
        file_path: str,
        img_dir_or_bucket_prefix: str,
        drop_mode=DropMode.WHOLE_PDF,
        md_make_mode=MakeMode.MM_MD,
    ):
        """Dump the markdown.

        Args:
            writer (DataWriter): file writer handle
            file_path (str): the file location of the markdown
            img_dir_or_bucket_prefix (str): the s3 bucket prefix or local file directory used to store the figures
            drop_mode (str, optional): drop strategy for corrupted or inappropriate pages. Defaults to DropMode.WHOLE_PDF.
            md_make_mode (str, optional): the type of markdown content to make. Defaults to MakeMode.MM_MD.
        """
        pdf_info_list = self._pipe_res['pdf_info']
        md_content = union_make(
            pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
        )
        writer.write_string(file_path, md_content)

    def dump_content_list(
        self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str
    ):
        """Dump the content list.

        Args:
            writer (DataWriter): file writer handle
            file_path (str): the file location of the content list
            image_dir_or_bucket_prefix (str): the s3 bucket prefix or local file directory used to store the figures
        """
        pdf_info_list = self._pipe_res['pdf_info']
        content_list = union_make(
            pdf_info_list,
            MakeMode.STANDARD_FORMAT,
            DropMode.NONE,
            image_dir_or_bucket_prefix,
        )
        writer.write_string(
            file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
        )

    def dump_middle_json(self, writer: DataWriter, file_path: str):
        """Dump the result of the pipeline.

        Args:
            writer (DataWriter): file writer handle
            file_path (str): the file location of the middle json
        """
        writer.write_string(
            file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4)
        )

    def draw_layout(self, file_path: str) -> None:
        """Draw the layout.

        Args:
            file_path (str): the file location of the layout result file
        """
        dir_name = os.path.dirname(file_path)
        base_name = os.path.basename(file_path)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name, exist_ok=True)
        pdf_info = self._pipe_res['pdf_info']
        draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def draw_span(self, file_path: str):
        """Draw the spans.

        Args:
            file_path (str): the file location of the span result file
        """
        dir_name = os.path.dirname(file_path)
        base_name = os.path.basename(file_path)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name, exist_ok=True)
        pdf_info = self._pipe_res['pdf_info']
        draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def draw_line_sort(self, file_path: str):
        """Draw the line sort order.

        Args:
            file_path (str): the file location of the line sort result file
        """
        dir_name = os.path.dirname(file_path)
        base_name = os.path.basename(file_path)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name, exist_ok=True)
        pdf_info = self._pipe_res['pdf_info']
        draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def get_compress_pdf_mid_data(self):
        """Compress the pipeline result.

        Returns:
            str: the compressed pipeline result
        """
        return JsonCompressor.compress_json(self._pipe_res)

    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to the pipeline result.

        Args:
            proc (Callable): invoked as proc(pipeline_result, *args, **kwargs)

        Returns:
            Any: the result generated by proc
        """
        return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)
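For orientation, a minimal sketch of how this apply hook is meant to be used; pipe_result stands for any PipeResult instance, and the page_count helper is hypothetical, not part of this release:

    def page_count(pipe_res) -> int:
        return len(pipe_res['pdf_info'])  # 'pdf_info' holds one entry per parsed page

    n = pipe_result.apply(page_count)  # proc receives a deep copy, so it cannot mutate the stored result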
-import copy
-import json as json_parse
 import os

 import click
@@ -7,13 +5,12 @@ import fitz
 from loguru import logger

 import magic_pdf.model as model_config
+from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
-from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
-                                      draw_model_bbox, draw_span_bbox)
-from magic_pdf.pipe.OCRPipe import OCRPipe
-from magic_pdf.pipe.TXTPipe import TXTPipe
-from magic_pdf.pipe.UNIPipe import UNIPipe
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+from magic_pdf.model.operators import InferenceResult

 # from io import BytesIO
 # from pypdf import PdfReader, PdfWriter
@@ -56,7 +53,11 @@ def prepare_env(output_dir, pdf_file_name, method):
 def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
     document = fitz.open('pdf', pdf_bytes)
     output_document = fitz.open()
-    end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(document) - 1
+    end_page_id = (
+        end_page_id
+        if end_page_id is not None and end_page_id >= 0
+        else len(document) - 1
+    )
     if end_page_id > len(document) - 1:
         logger.warning('end_page_id is out of range, use pdf_docs length')
         end_page_id = len(document) - 1
@@ -94,78 +95,126 @@ def do_parse(
     f_draw_model_bbox = True
     f_draw_line_sort_bbox = True

-    if lang == "":
+    if lang == '':
         lang = None

-    pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)
-
-    orig_model_list = copy.deepcopy(model_list)
-    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
-                                                parse_method)
-
-    image_writer, md_writer = FileBasedDataWriter(
-        local_image_dir), FileBasedDataWriter(local_md_dir)
+    pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
+        pdf_bytes, start_page_id, end_page_id
+    )
+
+    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
     image_dir = str(os.path.basename(local_image_dir))

-    if parse_method == 'auto':
-        jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
-        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
-                       # start_page_id=start_page_id, end_page_id=end_page_id,
-                       lang=lang,
-                       layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
-    elif parse_method == 'txt':
-        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
-                       # start_page_id=start_page_id, end_page_id=end_page_id,
-                       lang=lang,
-                       layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
-    elif parse_method == 'ocr':
-        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
-                       # start_page_id=start_page_id, end_page_id=end_page_id,
-                       lang=lang,
-                       layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
-    else:
-        logger.error('unknown parse method')
-        exit(1)
-
-    pipe.pipe_classify()
+    ds = PymuDocDataset(pdf_bytes)

     if len(model_list) == 0:
         if model_config.__use_inside_model__:
-            pipe.pipe_analyze()
-            orig_model_list = copy.deepcopy(pipe.model_list)
+            if parse_method == 'auto':
+                if ds.classify() == SupportedPdfParseMethod.TXT:
+                    infer_result = ds.apply(
+                        doc_analyze,
+                        ocr=False,
+                        lang=lang,
+                        layout_model=layout_model,
+                        formula_enable=formula_enable,
+                        table_enable=table_enable,
+                    )
+                    pipe_result = infer_result.pipe_txt_mode(
+                        image_writer, debug_mode=True, lang=lang
+                    )
+                else:
+                    infer_result = ds.apply(
+                        doc_analyze,
+                        ocr=True,
+                        lang=lang,
+                        layout_model=layout_model,
+                        formula_enable=formula_enable,
+                        table_enable=table_enable,
+                    )
+                    pipe_result = infer_result.pipe_ocr_mode(
+                        image_writer, debug_mode=True, lang=lang
+                    )
+            elif parse_method == 'txt':
+                infer_result = ds.apply(
+                    doc_analyze,
+                    ocr=False,
+                    lang=lang,
+                    layout_model=layout_model,
+                    formula_enable=formula_enable,
+                    table_enable=table_enable,
+                )
+                pipe_result = infer_result.pipe_txt_mode(
+                    image_writer, debug_mode=True, lang=lang
+                )
+            elif parse_method == 'ocr':
+                infer_result = ds.apply(
+                    doc_analyze,
+                    ocr=True,
+                    lang=lang,
+                    layout_model=layout_model,
+                    formula_enable=formula_enable,
+                    table_enable=table_enable,
+                )
+                pipe_result = infer_result.pipe_ocr_mode(
+                    image_writer, debug_mode=True, lang=lang
+                )
+            else:
+                logger.error('unknown parse method')
+                exit(1)
         else:
             logger.error('need model list input')
             exit(2)
+    else:
+        infer_result = InferenceResult(model_list, ds)
+        if parse_method == 'ocr':
+            pipe_result = infer_result.pipe_ocr_mode(
+                image_writer, debug_mode=True, lang=lang
+            )
+        elif parse_method == 'txt':
+            pipe_result = infer_result.pipe_txt_mode(
+                image_writer, debug_mode=True, lang=lang
+            )
+        else:
+            pipe_result = infer_result.pipe_auto_mode(
+                image_writer, debug_mode=True, lang=lang
+            )
+
+    if f_draw_model_bbox:
+        infer_result.draw_model(
+            os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf')
+        )

-    pipe.pipe_parse()
-    pdf_info = pipe.pdf_mid_data['pdf_info']
     if f_draw_layout_bbox:
-        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
+        pipe_result.draw_layout(
+            os.path.join(local_md_dir, f'{pdf_file_name}_layout.pdf')
+        )

     if f_draw_span_bbox:
-        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
-    if f_draw_model_bbox:
-        draw_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
+        pipe_result.draw_span(os.path.join(local_md_dir, f'{pdf_file_name}_spans.pdf'))

     if f_draw_line_sort_bbox:
-        draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
+        pipe_result.draw_line_sort(
+            os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
+        )

-    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
     if f_dump_md:
-        md_writer.write_string(
+        pipe_result.dump_md(
+            md_writer,
             f'{pdf_file_name}.md',
-            md_content
+            image_dir,
+            drop_mode=DropMode.NONE,
+            md_make_mode=f_make_md_mode,
         )

     if f_dump_middle_json:
-        md_writer.write_string(
-            f'{pdf_file_name}_middle.json',
-            json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
-        )
+        pipe_result.dump_middle_json(md_writer, f'{pdf_file_name}_middle.json')

     if f_dump_model_json:
-        md_writer.write_string(
-            f'{pdf_file_name}_model.json',
-            json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4)
-        )
+        infer_result.dump_model(md_writer, f'{pdf_file_name}_model.json')

     if f_dump_orig_pdf:
         md_writer.write(
@@ -173,11 +222,11 @@ def do_parse(
             pdf_bytes,
         )

-    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
     if f_dump_content_list:
-        md_writer.write_string(
+        pipe_result.dump_content_list(
+            md_writer,
             f'{pdf_file_name}_content_list.json',
-            json_parse.dumps(content_list, ensure_ascii=False, indent=4)
+            image_dir
         )

     logger.info(f'local output dir is {local_md_dir}')
...
@@ -10,22 +10,29 @@
 from loguru import logger

 from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
 from magic_pdf.libs.version import __version__
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
 from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
+from magic_pdf.config.constants import PARSE_TYPE_TXT, PARSE_TYPE_OCR

-PARSE_TYPE_TXT = 'txt'
-PARSE_TYPE_OCR = 'ocr'

-def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
-                  start_page_id=0, end_page_id=None, lang=None,
-                  *args, **kwargs):
+def parse_txt_pdf(
+    dataset: Dataset,
+    model_list: list,
+    imageWriter: DataWriter,
+    is_debug=False,
+    start_page_id=0,
+    end_page_id=None,
+    lang=None,
+    *args,
+    **kwargs
+):
     """Parse a text-type pdf."""
     pdf_info_dict = parse_pdf_by_txt(
-        pdf_bytes,
-        pdf_models,
+        dataset,
+        model_list,
         imageWriter,
         start_page_id=start_page_id,
         end_page_id=end_page_id,
@@ -43,13 +50,21 @@ def parse_txt_pdf(
     return pdf_info_dict

-def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
-                  start_page_id=0, end_page_id=None, lang=None,
-                  *args, **kwargs):
+def parse_ocr_pdf(
+    dataset: Dataset,
+    model_list: list,
+    imageWriter: DataWriter,
+    is_debug=False,
+    start_page_id=0,
+    end_page_id=None,
+    lang=None,
+    *args,
+    **kwargs
+):
     """Parse an OCR-type pdf."""
     pdf_info_dict = parse_pdf_by_ocr(
-        pdf_bytes,
-        pdf_models,
+        dataset,
+        model_list,
         imageWriter,
         start_page_id=start_page_id,
         end_page_id=end_page_id,
@@ -67,17 +82,24 @@ def parse_ocr_pdf(
     return pdf_info_dict

-def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
-                    input_model_is_empty: bool = False,
-                    start_page_id=0, end_page_id=None, lang=None,
-                    *args, **kwargs):
+def parse_union_pdf(
+    dataset: Dataset,
+    model_list: list,
+    imageWriter: DataWriter,
+    is_debug=False,
+    start_page_id=0,
+    end_page_id=None,
+    lang=None,
+    *args,
+    **kwargs
+):
     """Parse a pdf that mixes OCR and text extraction, and parse everything."""

     def parse_pdf(method):
         try:
             return method(
-                pdf_bytes,
-                pdf_models,
+                dataset,
+                model_list,
                 imageWriter,
                 start_page_id=start_page_id,
                 end_page_id=end_page_id,
@@ -91,12 +113,12 @@ def parse_union_pdf(
     pdf_info_dict = parse_pdf(parse_pdf_by_txt)
     if pdf_info_dict is None or pdf_info_dict.get('_need_drop', False):
         logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
-        if input_model_is_empty:
+        if len(model_list) == 0:
             layout_model = kwargs.get('layout_model', None)
             formula_enable = kwargs.get('formula_enable', None)
             table_enable = kwargs.get('table_enable', None)
-            pdf_models = doc_analyze(
-                pdf_bytes,
+            infer_res = doc_analyze(
+                dataset,
                 ocr=True,
                 start_page_id=start_page_id,
                 end_page_id=end_page_id,
@@ -105,6 +127,7 @@ def parse_union_pdf(
                 formula_enable=formula_enable,
                 table_enable=table_enable,
             )
+            model_list = infer_res.get_infer_res()
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
         if pdf_info_dict is None:
             raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
...
@@ -7,3 +7,5 @@
 api/read_api
 api/schemas
 api/io
+api/pipe_operators
+api/model_operators
\ No newline at end of file
Model Api
==========

.. autoclass:: magic_pdf.model.InferenceResultBase
   :members:
   :inherited-members:
   :show-inheritance:


Pipeline Api
=============

.. autoclass:: magic_pdf.pipe.operators.PipeResult
   :members:
   :inherited-members:
   :show-inheritance:
\ No newline at end of file
@@ -114,7 +114,7 @@ autodoc_mock_imports = [
     'sentencepiece',
     'vllm.cuda_utils',
     'vllm._C',
-    'numpy',
+    # 'numpy',
     'tqdm',
 ]
...
@@ -12,17 +12,17 @@ Local File Example
     import os

     from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-    from magic_pdf.config.make_content_config import DropMode, MakeMode
-    from magic_pdf.pipe.OCRPipe import OCRPipe
+    from magic_pdf.data.dataset import PymuDocDataset
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

-    ## args
-    model_list = []
+    # args
     pdf_file_name = "abc.pdf"  # replace with the real pdf path
+    name_without_suff = pdf_file_name.split(".")[0]

-    ## prepare env
+    # prepare env
     local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
     os.makedirs(local_image_dir, exist_ok=True)

     image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
@@ -30,27 +30,31 @@ Local File Example
     )
     image_dir = str(os.path.basename(local_image_dir))

+    # read bytes
     reader1 = FileBasedDataReader("")
     pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

-    pipe = OCRPipe(pdf_bytes, model_list, image_writer)
-
-    pipe.pipe_classify()
-    pipe.pipe_analyze()
-    pipe.pipe_parse()
-    pdf_info = pipe.pdf_mid_data["pdf_info"]
-
-    md_content = pipe.pipe_mk_markdown(
-        image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
-    )
-
-    if isinstance(md_content, list):
-        md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
-    else:
-        md_writer.write_string(f"{pdf_file_name}.md", md_content)
+    # proc
+    ## Create Dataset Instance
+    ds = PymuDocDataset(pdf_bytes)
+
+    ## inference
+    infer_result = ds.apply(doc_analyze, ocr=True)
+
+    ### draw model result on each page
+    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
+
+    ## pipeline
+    pipe_result = infer_result.pipe_ocr_mode(image_writer)
+
+    ### draw layout result on each page
+    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
+
+    ### draw spans result on each page
+    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
+
+    ### dump markdown
+    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
S3 File Example
@@ -61,8 +65,8 @@ S3 File Example
     import os

     from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
-    from magic_pdf.config.make_content_config import DropMode, MakeMode
-    from magic_pdf.pipe.OCRPipe import OCRPipe
+    from magic_pdf.data.dataset import PymuDocDataset
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

     bucket_name = "{Your S3 Bucket Name}"  # replace with real bucket name
     ak = "{Your S3 access key}"  # replace with real s3 access key
@@ -74,29 +78,39 @@ S3 File Example
     writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
     image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)

-    ## args
-    model_list = []
-    pdf_file_name = f"s3://{bucket_name}/{fake pdf path}"  # replace with the real s3 path
+    # args
+    pdf_file_name = (
+        "s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf"  # replace with the real s3 path
+    )

+    # prepare env
+    local_dir = "output"
+    name_without_suff = os.path.basename(pdf_file_name).split(".")[0]

+    # read bytes
     pdf_bytes = reader.read(pdf_file_name)  # read the pdf content

-    pipe = OCRPipe(pdf_bytes, model_list, image_writer)
-
-    pipe.pipe_classify()
-    pipe.pipe_analyze()
-    pipe.pipe_parse()
-    pdf_info = pipe.pdf_mid_data["pdf_info"]
-
-    md_content = pipe.pipe_mk_markdown(
-        "unittest/tmp/images", drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
-    )
-
-    if isinstance(md_content, list):
-        writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
-    else:
-        writer.write_string(f"{pdf_file_name}.md", md_content)
+    # proc
+    ## Create Dataset Instance
+    ds = PymuDocDataset(pdf_bytes)
+
+    ## inference
+    infer_result = ds.apply(doc_analyze, ocr=True)
+
+    ### draw model result on each page
+    infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf'))  # dump to local
+
+    ## pipeline
+    pipe_result = infer_result.pipe_ocr_mode(image_writer)
+
+    ### draw layout result on each page
+    pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf'))  # dump to local
+
+    ### draw spans result on each page
+    pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf'))  # dump to local
+
+    ### dump markdown
+    pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")  # dump to remote s3

-Check :doc:`../data/data_reader_writer` for more [reader | writer] examples
+Check :doc:`../data/data_reader_writer` for more [reader | writer] examples and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for api details
@@ -7,4 +7,6 @@ From beginning to end, show how to use mineru via a minimal project
 .. toctree::
    :maxdepth: 1

    tutorial/output_file_description
\ No newline at end of file
+   tutorial/pipeline
Pipeline
==========
Minimal Example
^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
name_without_suff = pdf_file_name.split(".")[0]
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
    local_md_dir
)
image_dir = str(os.path.basename(local_image_dir))
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
Running the above code produces the following output:
.. code:: bash
output/
├── abc.md
└── images
Excluding the setup of the environment, such as creating directories and importing dependencies, the actual code snippet for converting pdf to markdown is as follows:
.. code:: python
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
``ds.apply(doc_analyze, ocr=True)`` generates an ``InferenceResult`` object. The ``InferenceResult`` object, when executing the ``pipe_ocr_mode`` method, produces a ``PipeResult`` object.
The ``PipeResult`` object, upon executing ``dump_md``, generates a ``markdown`` file at the specified location.
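Spelled out step by step, the same chain reads as follows; this is a restatement of the one-liner above, reusing the names from the minimal example:

.. code:: python

    infer_result = ds.apply(doc_analyze, ocr=True)           # Dataset -> InferenceResult
    pipe_result = infer_result.pipe_ocr_mode(image_writer)   # InferenceResult -> PipeResult
    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)  # writes the markdown file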
The pipeline execution process is illustrated in the following diagram:
.. image:: ../../_static/image/pipeline.drawio.svg
.. raw:: html
<br> </br>
Currently, the process is divided into three stages: data, inference, and processing, which correspond to the ``Dataset``, ``InferenceResult``, and ``PipeResult`` entities in the diagram. These stages are linked together through methods such as ``apply``, ``doc_analyze``, and ``pipe_ocr_mode``.
.. admonition:: Tip
:class: tip
For more examples on how to use ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../quick_start/to_markdown`
For more detailed information about ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../../api/dataset`, :doc:`../../api/model_operators`, :doc:`../../api/pipe_operators`
Pipeline Composition
^^^^^^^^^^^^^^^^^^^^^
.. code:: python
class Dataset(ABC):
    @abstractmethod
    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to this dataset.

        Args:
            proc (Callable): invoked as proc(self, *args, **kwargs)

        Returns:
            Any: the result generated by proc
        """
        pass


class InferenceResult(InferenceResultBase):
    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to the inference result.

        Args:
            proc (Callable): invoked as proc(inference_result, *args, **kwargs)

        Returns:
            Any: the result generated by proc
        """
        return proc(copy.deepcopy(self._infer_res), *args, **kwargs)

    def pipe_ocr_mode(
        self,
        imageWriter: DataWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    ) -> PipeResult:
        pass


class PipeResult:
    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to the pipeline result.

        Args:
            proc (Callable): invoked as proc(pipeline_result, *args, **kwargs)

        Returns:
            Any: the result generated by proc
        """
        return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)
The ``Dataset``, ``InferenceResult``, and ``PipeResult`` classes all have an ``apply`` method, which can be used to chain different stages of the computation.
As shown below, ``MinerU`` provides a set of methods to compose these classes.
.. code:: python
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
Users can implement their own functions for chaining as needed. For example, a user could use the ``apply`` method to create a function that counts the number of pages in a ``pdf`` file.
.. code:: python
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
def count_page(ds) -> int:
    return len(ds)

print("page number: ", ds.apply(count_page))  # will output the page count of `abc.pdf`
+numpy==1.26.4
+click==8.1.7
+fast-langdetect==0.2.2
+Brotli==1.1.0
 boto3>=1.28.43
 loguru>=0.6.0
 myst-parser
@@ -9,4 +13,4 @@ sphinx-argparse>=0.5.2
 sphinx-book-theme>=1.1.3
 sphinx-copybutton>=0.5.2
 sphinx_rtd_theme>=3.0.1
-autodoc_pydantic>=2.2.0
\ No newline at end of file
+autodoc_pydantic>=2.2.0
Convert to Markdown
========================

Local File Example
-^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^

.. code:: python

     import os

     from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
-    from magic_pdf.config.make_content_config import DropMode, MakeMode
-    from magic_pdf.pipe.OCRPipe import OCRPipe
+    from magic_pdf.data.dataset import PymuDocDataset
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

-    ## args
-    model_list = []
+    # args
     pdf_file_name = "abc.pdf"  # replace with the real pdf path
+    name_without_suff = pdf_file_name.split(".")[0]

-    ## prepare env
+    # prepare env
     local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
     os.makedirs(local_image_dir, exist_ok=True)

     image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
@@ -30,39 +28,43 @@
     )
     image_dir = str(os.path.basename(local_image_dir))

+    # read bytes
     reader1 = FileBasedDataReader("")
     pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

-    pipe = OCRPipe(pdf_bytes, model_list, image_writer)
-
-    pipe.pipe_classify()
-    pipe.pipe_analyze()
-    pipe.pipe_parse()
-    pdf_info = pipe.pdf_mid_data["pdf_info"]
-
-    md_content = pipe.pipe_mk_markdown(
-        image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
-    )
-
-    if isinstance(md_content, list):
-        md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
-    else:
-        md_writer.write_string(f"{pdf_file_name}.md", md_content)
+    # proc
+    ## Create Dataset Instance
+    ds = PymuDocDataset(pdf_bytes)
+
+    ## inference
+    infer_result = ds.apply(doc_analyze, ocr=True)
+
+    ### draw model result on each page
+    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
+
+    ## pipeline
+    pipe_result = infer_result.pipe_ocr_mode(image_writer)
+
+    ### draw layout result on each page
+    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
+
+    ### draw spans result on each page
+    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
+
+    ### dump markdown
+    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

-Object Storage Usage Example
-^^^^^^^^^^^^^^^
+Object Storage File Example
+^^^^^^^^^^^^^^^^

.. code:: python

     import os

     from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
-    from magic_pdf.config.make_content_config import DropMode, MakeMode
-    from magic_pdf.pipe.OCRPipe import OCRPipe
+    from magic_pdf.data.dataset import PymuDocDataset
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

     bucket_name = "{Your S3 Bucket Name}"  # replace with real bucket name
     ak = "{Your S3 access key}"  # replace with real s3 access key
@@ -74,30 +76,39 @@
     writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
     image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)

-    ## args
-    model_list = []
-    pdf_file_name = f"s3://{bucket_name}/{fake pdf path}"  # replace with the real s3 path
+    # args
+    pdf_file_name = (
+        "s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf"  # replace with the real s3 path
+    )

+    # prepare env
+    local_dir = "output"
+    name_without_suff = os.path.basename(pdf_file_name).split(".")[0]

+    # read bytes
     pdf_bytes = reader.read(pdf_file_name)  # read the pdf content

-    pipe = OCRPipe(pdf_bytes, model_list, image_writer)
-
-    pipe.pipe_classify()
-    pipe.pipe_analyze()
-    pipe.pipe_parse()
-    pdf_info = pipe.pdf_mid_data["pdf_info"]
-
-    md_content = pipe.pipe_mk_markdown(
-        "unittest/tmp/images", drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
-    )
-
-    if isinstance(md_content, list):
-        writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
-    else:
-        writer.write_string(f"{pdf_file_name}.md", md_content)
+    # proc
+    ## Create Dataset Instance
+    ds = PymuDocDataset(pdf_bytes)
+
+    ## inference
+    infer_result = ds.apply(doc_analyze, ocr=True)
+
+    ### draw model result on each page
+    infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf'))  # dump to local
+
+    ## pipeline
+    pipe_result = infer_result.pipe_ocr_mode(image_writer)
+
+    ### draw layout result on each page
+    pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf'))  # dump to local
+
+    ### draw spans result on each page
+    pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf'))  # dump to local
+
+    ### dump markdown
+    pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")  # dump to remote s3

See :doc:`../data/data_reader_writer` for more **read/write** examples
@@ -9,3 +9,5 @@
    :caption: Tutorial

    tutorial/output_file_description
+   tutorial/pipeline