Unverified commit b4f7b53e authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #1261 from opendatalab/release-0.10.6

Release 0.10.6
parents a962824b d3b51aa5
......@@ -4,8 +4,8 @@ import statistics
import time
from typing import List
import torch
import fitz
import torch
from loguru import logger
from magic_pdf.config.enums import SupportedPdfParseMethod
......@@ -16,17 +16,13 @@ from magic_pdf.libs.clean_memory import clean_memory
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
from magic_pdf.model.magic_model import MagicModel
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # stop albumentations from checking for updates
os.environ['YOLO_VERBOSE'] = 'False' # disable yolo logger
try:
import torchtext
if torchtext.__version__ >= "0.18.0":
if torchtext.__version__ >= '0.18.0':
torchtext.disable_torchtext_deprecation_warning()
except ImportError:
pass
......@@ -39,6 +35,9 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # stop albumentations from checking for updates
os.environ['YOLO_VERBOSE'] = 'False' # disable yolo logger
def __replace_STX_ETX(text_str: str):
"""Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
......@@ -233,7 +232,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
# initialize the OCR model
atom_model_manager = AtomModelSingleton()
ocr_model = atom_model_manager.get_atom_model(
atom_model_name="ocr",
atom_model_name='ocr',
ocr_show_log=False,
det_db_box_thresh=0.3,
lang=lang
......@@ -241,7 +240,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
for span in empty_spans:
# crop the span's bbox from the page and run OCR on the crop
span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
ocr_res = ocr_model.ocr(span_img, det=False)
if ocr_res and len(ocr_res) > 0:
if len(ocr_res[0]) > 0:
......@@ -681,7 +680,7 @@ def parse_page_core(
"""根据parse_mode,构造spans,主要是文本类的字符填充"""
if parse_mode == SupportedPdfParseMethod.TXT:
"""使用新版本的混合ocr方案"""
"""使用新版本的混合ocr方案."""
spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
elif parse_mode == SupportedPdfParseMethod.OCR:
......@@ -689,7 +688,6 @@ def parse_page_core(
else:
raise Exception('parse_mode must be txt or ocr')
"""先处理不需要排版的discarded_blocks"""
discarded_block_with_spans, spans = fill_spans_in_blocks(
all_discarded_blocks, spans, 0.4
......@@ -762,8 +760,8 @@ def parse_page_core(
def pdf_parse_union(
dataset: Dataset,
model_list,
dataset: Dataset,
imageWriter,
parse_mode,
start_page_id=0,
......@@ -771,6 +769,7 @@ def pdf_parse_union(
debug_mode=False,
lang=None,
):
pdf_bytes_md5 = compute_md5(dataset.data_bits())
"""初始化空的pdf_info_dict"""
......
......@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.dict2md.ocr_mkcontent import union_make
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
......@@ -14,9 +15,9 @@ class AbsPipe(ABC):
PIP_OCR = 'ocr'
PIP_TXT = 'txt'
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
def __init__(self, dataset: Dataset, model_list: list, image_writer: DataWriter, is_debug: bool = False,
start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
self.pdf_bytes = pdf_bytes
self.dataset = dataset
self.model_list = model_list
self.image_writer = image_writer
self.pdf_mid_data = None  # uncompressed
......
......@@ -2,40 +2,79 @@ from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_ocr_pdf
class OCRPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
start_page_id=0, end_page_id=None, lang=None,
layout_model=None, formula_enable=None, table_enable=None):
super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
layout_model, formula_enable, table_enable)
def __init__(
self,
dataset: Dataset,
model_list: list,
image_writer: DataWriter,
is_debug: bool = False,
start_page_id=0,
end_page_id=None,
lang=None,
layout_model=None,
formula_enable=None,
table_enable=None,
):
super().__init__(
dataset,
model_list,
image_writer,
is_debug,
start_page_id,
end_page_id,
lang,
layout_model,
formula_enable,
table_enable,
)
def pipe_classify(self):
pass
def pipe_analyze(self):
self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
self.infer_res = doc_analyze(
self.dataset,
ocr=True,
start_page_id=self.start_page_id,
end_page_id=self.end_page_id,
lang=self.lang,
layout_model=self.layout_model,
formula_enable=self.formula_enable,
table_enable=self.table_enable,
)
def pipe_parse(self):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
self.pdf_mid_data = parse_ocr_pdf(
self.dataset,
self.infer_res,
self.image_writer,
is_debug=self.is_debug,
start_page_id=self.start_page_id,
end_page_id=self.end_page_id,
lang=self.lang,
layout_model=self.layout_model,
formula_enable=self.formula_enable,
table_enable=self.table_enable,
)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
logger.info('ocr_pipe mk content list finished')
return result
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
def pipe_mk_markdown(
self,
img_parent_path: str,
drop_mode=DropMode.WHOLE_PDF,
md_make_mode=MakeMode.MM_MD,
):
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
logger.info(f'ocr_pipe mk {md_make_mode} finished')
return result
......@@ -2,6 +2,7 @@ from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_txt_pdf
......@@ -9,23 +10,23 @@ from magic_pdf.user_api import parse_txt_pdf
class TXTPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
def __init__(self, dataset: Dataset, model_list: list, image_writer: DataWriter, is_debug: bool = False,
start_page_id=0, end_page_id=None, lang=None,
layout_model=None, formula_enable=None, table_enable=None):
super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
super().__init__(dataset, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
layout_model, formula_enable, table_enable)
def pipe_classify(self):
pass
def pipe_analyze(self):
self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
self.model_list = doc_analyze(self.dataset, ocr=False,
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
def pipe_parse(self):
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
self.pdf_mid_data = parse_txt_pdf(self.dataset, self.model_list, self.image_writer, is_debug=self.is_debug,
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
......
......@@ -4,6 +4,7 @@ from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.commons import join_path
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pipe.AbsPipe import AbsPipe
......@@ -12,12 +13,32 @@ from magic_pdf.user_api import parse_ocr_pdf, parse_union_pdf
class UNIPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: DataWriter, is_debug: bool = False,
start_page_id=0, end_page_id=None, lang=None,
layout_model=None, formula_enable=None, table_enable=None):
def __init__(
self,
dataset: Dataset,
jso_useful_key: dict,
image_writer: DataWriter,
is_debug: bool = False,
start_page_id=0,
end_page_id=None,
lang=None,
layout_model=None,
formula_enable=None,
table_enable=None,
):
self.pdf_type = jso_useful_key['_pdf_type']
super().__init__(pdf_bytes, jso_useful_key['model_list'], image_writer, is_debug, start_page_id, end_page_id,
lang, layout_model, formula_enable, table_enable)
super().__init__(
dataset,
jso_useful_key['model_list'],
image_writer,
is_debug,
start_page_id,
end_page_id,
lang,
layout_model,
formula_enable,
table_enable,
)
if len(self.model_list) == 0:
self.input_model_is_empty = True
else:
......@@ -28,35 +49,66 @@ class UNIPipe(AbsPipe):
def pipe_analyze(self):
if self.pdf_type == self.PIP_TXT:
self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
self.model_list = doc_analyze(
self.dataset,
ocr=False,
start_page_id=self.start_page_id,
end_page_id=self.end_page_id,
lang=self.lang,
layout_model=self.layout_model,
formula_enable=self.formula_enable,
table_enable=self.table_enable,
)
elif self.pdf_type == self.PIP_OCR:
self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
self.model_list = doc_analyze(
self.dataset,
ocr=True,
start_page_id=self.start_page_id,
end_page_id=self.end_page_id,
lang=self.lang,
layout_model=self.layout_model,
formula_enable=self.formula_enable,
table_enable=self.table_enable,
)
def pipe_parse(self):
if self.pdf_type == self.PIP_TXT:
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang, layout_model=self.layout_model,
formula_enable=self.formula_enable, table_enable=self.table_enable)
self.pdf_mid_data = parse_union_pdf(
self.dataset,
self.model_list,
self.image_writer,
is_debug=self.is_debug,
start_page_id=self.start_page_id,
end_page_id=self.end_page_id,
lang=self.lang,
layout_model=self.layout_model,
formula_enable=self.formula_enable,
table_enable=self.table_enable,
)
elif self.pdf_type == self.PIP_OCR:
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug,
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
lang=self.lang)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON):
self.pdf_mid_data = parse_ocr_pdf(
self.dataset,
self.model_list,
self.image_writer,
is_debug=self.is_debug,
start_page_id=self.start_page_id,
end_page_id=self.end_page_id,
lang=self.lang,
)
def pipe_mk_uni_format(
self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON
):
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
logger.info('uni_pipe mk content list finished')
return result
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
def pipe_mk_markdown(
self,
img_parent_path: str,
drop_mode=DropMode.WHOLE_PDF,
md_make_mode=MakeMode.MM_MD,
):
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
logger.info(f'uni_pipe mk {md_make_mode} finished')
return result
......@@ -65,6 +117,7 @@ class UNIPipe(AbsPipe):
if __name__ == '__main__':
# test
from magic_pdf.data.data_reader_writer import DataReader
drw = DataReader(r'D:/project/20231108code-clean')
pdf_file_path = r'linshixuqiu\19983-00.pdf'
......@@ -82,10 +135,7 @@ if __name__ == '__main__':
# "model_list": model_list
# }
jso_useful_key = {
'_pdf_type': '',
'model_list': model_list
}
jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
pipe.pipe_classify()
pipe.pipe_parse()
......@@ -94,5 +144,7 @@ if __name__ == '__main__':
md_writer = DataWriter(write_path)
md_writer.write_string('19983-00.md', md_content)
md_writer.write_string('19983-00.json', json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4))
md_writer.write_string(
'19983-00.json', json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
)
md_writer.write_string('19983-00.txt', str(content_list))
import copy
import json
import os
from typing import Callable
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.dict2md.ocr_mkcontent import union_make
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
draw_span_bbox)
from magic_pdf.libs.json_compressor import JsonCompressor
class PipeResult:
def __init__(self, pipe_res, dataset: Dataset):
"""Initialized.
Args:
pipe_res (list[dict]): the pipeline-processed result of the model inference
dataset (Dataset): the dataset associated with pipe_res
"""
self._pipe_res = pipe_res
self._dataset = dataset
def dump_md(
self,
writer: DataWriter,
file_path: str,
img_dir_or_bucket_prefix: str,
drop_mode=DropMode.WHOLE_PDF,
md_make_mode=MakeMode.MM_MD,
):
"""Dump The Markdown.
Args:
writer (DataWriter): File writer handle
file_path (str): The file location of the markdown
img_dir_or_bucket_prefix (str): The S3 bucket prefix or local directory used to store the figures
drop_mode (str, optional): Drop strategy for pages that are corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
md_make_mode (str, optional): The type of markdown content to make. Defaults to MakeMode.MM_MD.
"""
pdf_info_list = self._pipe_res['pdf_info']
md_content = union_make(
pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
)
writer.write_string(file_path, md_content)
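    # Usage sketch (names assumed): with `md_writer` rooted at the output dir
    # and the figures already dumped under `images/`, this writes `abc.md`
    # whose image links point at the `images/` prefix:
    #   pipe_result.dump_md(md_writer, 'abc.md', 'images')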
def dump_content_list(
self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str
):
"""Dump Content List.
Args:
writer (DataWriter): File writer handle
file_path (str): The file location of the content list
image_dir_or_bucket_prefix (str): The S3 bucket prefix or local directory used to store the figures
"""
pdf_info_list = self._pipe_res['pdf_info']
content_list = union_make(
pdf_info_list,
MakeMode.STANDARD_FORMAT,
DropMode.NONE,
image_dir_or_bucket_prefix,
)
writer.write_string(
file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
)
def dump_middle_json(self, writer: DataWriter, file_path: str):
"""Dump the result of pipeline.
Args:
writer (DataWriter): File writer handler
file_path (str): The file location of middle json
"""
writer.write_string(
file_path, json.dumps(self._pipe_res, ensure_ascii=False, indent=4)
)
def draw_layout(self, file_path: str) -> None:
"""Draw the layout.
Args:
file_path (str): The file location of the layout result file
"""
dir_name = os.path.dirname(file_path)
base_name = os.path.basename(file_path)
if not os.path.exists(dir_name):
os.makedirs(dir_name, exist_ok=True)
pdf_info = self._pipe_res['pdf_info']
draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
def draw_span(self, file_path: str):
"""Draw the Span.
Args:
file_path (str): The file location of span result file
"""
dir_name = os.path.dirname(file_path)
base_name = os.path.basename(file_path)
if not os.path.exists(dir_name):
os.makedirs(dir_name, exist_ok=True)
pdf_info = self._pipe_res['pdf_info']
draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
def draw_line_sort(self, file_path: str):
"""Draw line sort.
Args:
file_path (str): The file location of the line sort result file
"""
dir_name = os.path.dirname(file_path)
base_name = os.path.basename(file_path)
if not os.path.exists(dir_name):
os.makedirs(dir_name, exist_ok=True)
pdf_info = self._pipe_res['pdf_info']
draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)
def get_compress_pdf_mid_data(self):
"""Compress the pipeline result.
Returns:
str: the compressed pipeline result
"""
return JsonCompressor.compress_json(self._pipe_res)
def apply(self, proc: Callable, *args, **kwargs):
"""Apply callable method which.
Args:
proc (Callable): invoke proc as follows:
proc(pipeline_result, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)
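    # Usage sketch (assumed): `apply` hands the callable a deep copy of the
    # pipeline result, so inspecting it is side-effect free, e.g.:
    #   page_count = pipe_result.apply(lambda res: len(res['pdf_info']))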
import copy
import json as json_parse
import os
import click
......@@ -7,13 +5,12 @@ import fitz
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
draw_model_bbox, draw_span_bbox)
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.model.operators import InferenceResult
# from io import BytesIO
# from pypdf import PdfReader, PdfWriter
......@@ -56,7 +53,11 @@ def prepare_env(output_dir, pdf_file_name, method):
def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
document = fitz.open('pdf', pdf_bytes)
output_document = fitz.open()
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(document) - 1
end_page_id = (
end_page_id
if end_page_id is not None and end_page_id >= 0
else len(document) - 1
)
if end_page_id > len(document) - 1:
logger.warning('end_page_id is out of range, use pdf_docs length')
end_page_id = len(document) - 1
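    # Remaining body elided by the hunk; presumably the selected page range is
    # copied into `output_document` (e.g. via PyMuPDF's insert_pdf) and its
    # bytes returned -- an assumption, not shown in this diff.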
......@@ -94,78 +95,126 @@ def do_parse(
f_draw_model_bbox = True
f_draw_line_sort_bbox = True
if lang == "":
if lang == '':
lang = None
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
pdf_bytes, start_page_id, end_page_id
)
orig_model_list = copy.deepcopy(model_list)
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
parse_method)
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(
local_image_dir), FileBasedDataWriter(local_md_dir)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
image_dir = str(os.path.basename(local_image_dir))
if parse_method == 'auto':
jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
# start_page_id=start_page_id, end_page_id=end_page_id,
lang=lang,
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
elif parse_method == 'txt':
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
# start_page_id=start_page_id, end_page_id=end_page_id,
lang=lang,
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
elif parse_method == 'ocr':
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
# start_page_id=start_page_id, end_page_id=end_page_id,
lang=lang,
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
else:
logger.error('unknown parse method')
exit(1)
pipe.pipe_classify()
ds = PymuDocDataset(pdf_bytes)
if len(model_list) == 0:
if model_config.__use_inside_model__:
pipe.pipe_analyze()
orig_model_list = copy.deepcopy(pipe.model_list)
if parse_method == 'auto':
if ds.classify() == SupportedPdfParseMethod.TXT:
infer_result = ds.apply(
doc_analyze,
ocr=False,
lang=lang,
layout_model=layout_model,
formula_enable=formula_enable,
table_enable=table_enable,
)
pipe_result = infer_result.pipe_txt_mode(
image_writer, debug_mode=True, lang=lang
)
else:
infer_result = ds.apply(
doc_analyze,
ocr=True,
lang=lang,
layout_model=layout_model,
formula_enable=formula_enable,
table_enable=table_enable,
)
pipe_result = infer_result.pipe_ocr_mode(
image_writer, debug_mode=True, lang=lang
)
elif parse_method == 'txt':
infer_result = ds.apply(
doc_analyze,
ocr=False,
lang=lang,
layout_model=layout_model,
formula_enable=formula_enable,
table_enable=table_enable,
)
pipe_result = infer_result.pipe_txt_mode(
image_writer, debug_mode=True, lang=lang
)
elif parse_method == 'ocr':
infer_result = ds.apply(
doc_analyze,
ocr=True,
lang=lang,
layout_model=layout_model,
formula_enable=formula_enable,
table_enable=table_enable,
)
pipe_result = infer_result.pipe_ocr_mode(
image_writer, debug_mode=True, lang=lang
)
else:
logger.error('unknown parse method')
exit(1)
else:
logger.error('need model list input')
exit(2)
else:
infer_result = InferenceResult(model_list, ds)
if parse_method == 'ocr':
pipe_result = infer_result.pipe_ocr_mode(
image_writer, debug_mode=True, lang=lang
)
elif parse_method == 'txt':
pipe_result = infer_result.pipe_txt_mode(
image_writer, debug_mode=True, lang=lang
)
else:
pipe_result = infer_result.pipe_auto_mode(
image_writer, debug_mode=True, lang=lang
)
if f_draw_model_bbox:
infer_result.draw_model(
os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf')
)
pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data['pdf_info']
if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
pipe_result.draw_layout(
os.path.join(local_md_dir, f'{pdf_file_name}_layout.pdf')
)
if f_draw_span_bbox:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
if f_draw_model_bbox:
draw_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
pipe_result.draw_span(os.path.join(local_md_dir, f'{pdf_file_name}_spans.pdf'))
if f_draw_line_sort_bbox:
draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
pipe_result.draw_line_sort(
os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
)
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
if f_dump_md:
md_writer.write_string(
pipe_result.dump_md(
md_writer,
f'{pdf_file_name}.md',
md_content
image_dir,
drop_mode=DropMode.NONE,
md_make_mode=f_make_md_mode,
)
if f_dump_middle_json:
md_writer.write_string(
f'{pdf_file_name}_middle.json',
json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
)
pipe_result.dump_middle_json(md_writer, f'{pdf_file_name}_middle.json')
if f_dump_model_json:
md_writer.write_string(
f'{pdf_file_name}_model.json',
json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4)
)
infer_result.dump_model(md_writer, f'{pdf_file_name}_model.json')
if f_dump_orig_pdf:
md_writer.write(
......@@ -173,11 +222,11 @@ def do_parse(
pdf_bytes,
)
content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
if f_dump_content_list:
md_writer.write_string(
pipe_result.dump_content_list(
md_writer,
f'{pdf_file_name}_content_list.json',
json_parse.dumps(content_list, ensure_ascii=False, indent=4)
image_dir
)
logger.info(f'local output dir is {local_md_dir}')
......
......@@ -10,22 +10,29 @@
from loguru import logger
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
PARSE_TYPE_TXT = 'txt'
PARSE_TYPE_OCR = 'ocr'
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
start_page_id=0, end_page_id=None, lang=None,
*args, **kwargs):
from magic_pdf.config.constants import PARSE_TYPE_TXT, PARSE_TYPE_OCR
def parse_txt_pdf(
dataset: Dataset,
model_list: list,
imageWriter: DataWriter,
is_debug=False,
start_page_id=0,
end_page_id=None,
lang=None,
*args,
**kwargs
):
"""解析文本类pdf."""
pdf_info_dict = parse_pdf_by_txt(
pdf_bytes,
pdf_models,
dataset,
model_list,
imageWriter,
start_page_id=start_page_id,
end_page_id=end_page_id,
......@@ -43,13 +50,21 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, i
return pdf_info_dict
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
start_page_id=0, end_page_id=None, lang=None,
*args, **kwargs):
def parse_ocr_pdf(
dataset: Dataset,
model_list: list,
imageWriter: DataWriter,
is_debug=False,
start_page_id=0,
end_page_id=None,
lang=None,
*args,
**kwargs
):
"""解析ocr类pdf."""
pdf_info_dict = parse_pdf_by_ocr(
pdf_bytes,
pdf_models,
dataset,
model_list,
imageWriter,
start_page_id=start_page_id,
end_page_id=end_page_id,
......@@ -67,17 +82,24 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, i
return pdf_info_dict
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
input_model_is_empty: bool = False,
start_page_id=0, end_page_id=None, lang=None,
*args, **kwargs):
def parse_union_pdf(
dataset: Dataset,
model_list: list,
imageWriter: DataWriter,
is_debug=False,
start_page_id=0,
end_page_id=None,
lang=None,
*args,
**kwargs
):
"""ocr和文本混合的pdf,全部解析出来."""
def parse_pdf(method):
try:
return method(
pdf_bytes,
pdf_models,
dataset,
model_list,
imageWriter,
start_page_id=start_page_id,
end_page_id=end_page_id,
......@@ -91,12 +113,12 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter,
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
if pdf_info_dict is None or pdf_info_dict.get('_need_drop', False):
logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
if input_model_is_empty:
if len(model_list) == 0:
layout_model = kwargs.get('layout_model', None)
formula_enable = kwargs.get('formula_enable', None)
table_enable = kwargs.get('table_enable', None)
pdf_models = doc_analyze(
pdf_bytes,
infer_res = doc_analyze(
dataset,
ocr=True,
start_page_id=start_page_id,
end_page_id=end_page_id,
......@@ -105,6 +127,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter,
formula_enable=formula_enable,
table_enable=table_enable,
)
model_list = infer_res.get_infer_res()
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None:
raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
......
......@@ -7,3 +7,5 @@
api/read_api
api/schemas
api/io
api/pipe_operators
api/model_operators
\ No newline at end of file
Model Api
==========
.. autoclass:: magic_pdf.model.InferenceResultBase
:members:
:inherited-members:
:show-inheritance:
Pipeline Api
=============
.. autoclass:: magic_pdf.pipe.operators.PipeResult
:members:
:inherited-members:
:show-inheritance:
\ No newline at end of file
......@@ -114,7 +114,7 @@ autodoc_mock_imports = [
'sentencepiece',
'vllm.cuda_utils',
'vllm._C',
'numpy',
# 'numpy',
'tqdm',
]
......
......@@ -12,17 +12,17 @@ Local File Example
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
## args
model_list = []
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
name_without_suff = pdf_file_name.split(".")[0]
## prepare env
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
......@@ -30,27 +30,31 @@ Local File Example
)
image_dir = str(os.path.basename(local_image_dir))
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
## inference
infer_result = ds.apply(doc_analyze, ocr=True)
pipe.pipe_classify()
pipe.pipe_analyze()
pipe.pipe_parse()
### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
pdf_info = pipe.pdf_mid_data["pdf_info"]
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
md_content = pipe.pipe_mk_markdown(
image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
)
### draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
if isinstance(md_content, list):
md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
else:
md_writer.write_string(f"{pdf_file_name}.md", md_content)
### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
S3 File Example
......@@ -61,8 +65,8 @@ S3 File Example
import os
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
ak = "{Your S3 access key}" # replace with real s3 access key
......@@ -74,29 +78,39 @@ S3 File Example
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
## args
model_list = []
pdf_file_name = f"s3://{bucket_name}/{fake pdf path}" # replace with the real s3 path
# args
pdf_file_name = (
"s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf" # replace with the real s3 path
)
# prepare env
local_dir = "output"
name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
# read bytes
pdf_bytes = reader.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
## inference
infer_result = ds.apply(doc_analyze, ocr=True)
pipe.pipe_classify()
pipe.pipe_analyze()
pipe.pipe_parse()
### draw model result on each page
infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local
pdf_info = pipe.pdf_mid_data["pdf_info"]
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
md_content = pipe.pipe_mk_markdown(
"unittest/tmp/images", drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
)
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local
### draw spans result on each page
pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local
if isinstance(md_content, list):
writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
else:
writer.write_string(f"{pdf_file_name}.md", md_content)
### dump markdown
pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
Check :doc:`../data/data_reader_writer` for more [reader | writer] examples
Check :doc:`../data/data_reader_writer` for more [reader | writer] examples, and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for API details
......@@ -7,4 +7,6 @@ From beginning to end, show how to use MinerU via a minimal project
.. toctree::
:maxdepth: 1
tutorial/output_file_description
\ No newline at end of file
tutorial/output_file_description
tutorial/pipeline
Pipeline
==========
Minimal Example
^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
name_without_suff = pdf_file_name.split(".")[0]
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
)
image_dir = str(os.path.basename(local_image_dir))
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
Running the above code will result in the following
.. code:: bash
output/
├── abc.md
└── images
Excluding the setup of the environment, such as creating directories and importing dependencies, the actual code snippet for converting pdf to markdown is as follows
.. code:: python
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
``ds.apply(doc_analyze, ocr=True)`` generates an ``InferenceResult`` object. The ``InferenceResult`` object, when executing the ``pipe_ocr_mode`` method, produces a ``PipeResult`` object.
The ``PipeResult`` object, upon executing ``dump_md``, generates a ``markdown`` file at the specified location.
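The same one-liner can be unrolled into its named intermediates, which makes the hand-off between stages explicit; a minimal sketch reusing ``ds``, ``image_writer``, ``md_writer``, ``name_without_suff``, and ``image_dir`` from the example above

.. code:: python

    # stage 1 -> 2: run model inference over the dataset
    infer_result = ds.apply(doc_analyze, ocr=True)           # InferenceResult

    # stage 2 -> 3: post-process the inference output in OCR mode
    pipe_result = infer_result.pipe_ocr_mode(image_writer)   # PipeResult

    # stage 3: write the markdown file via the configured writer
    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)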
The pipeline execution process is illustrated in the following diagram
.. image:: ../../_static/image/pipeline.drawio.svg
.. raw:: html
<br> </br>
Currently, the process is divided into three stages: data, inference, and processing, which correspond to the ``Dataset``, ``InferenceResult``, and ``PipeResult`` entities in the diagram.
These stages are linked together through methods like ``apply``, ``doc_analyze``, or ``pipe_ocr_mode``.
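Because every stage object exposes ``apply``, a custom callable can be hooked in between the built-in steps; a small sketch (the helper name is hypothetical) that inspects the per-page inference output before post-processing

.. code:: python

    # hypothetical helper: count how many pages produced inference output;
    # ``apply`` hands the callable a deep copy of the per-page results
    def count_infer_pages(infer_res) -> int:
        return len(infer_res)

    infer_result = ds.apply(doc_analyze, ocr=True)
    print("inferred pages:", infer_result.apply(count_infer_pages))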
.. admonition:: Tip
:class: tip
For more examples on how to use ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../quick_start/to_markdown`
For more detailed information about ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../../api/dataset`, :doc:`../../api/model_operators`, :doc:`../../api/pipe_operators`
Pipeline Composition
^^^^^^^^^^^^^^^^^^^^^
.. code:: python
class Dataset(ABC):
@abstractmethod
def apply(self, proc: Callable, *args, **kwargs):
"""Apply callable method which.
Args:
proc (Callable): invoke proc as follows:
proc(self, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
pass
class InferenceResult(InferenceResultBase):
def apply(self, proc: Callable, *args, **kwargs):
"""Apply callable method which.
Args:
proc (Callable): invoke proc as follows:
proc(inference_result, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
return proc(copy.deepcopy(self._infer_res), *args, **kwargs)
def pipe_ocr_mode(
self,
imageWriter: DataWriter,
start_page_id=0,
end_page_id=None,
debug_mode=False,
lang=None,
) -> PipeResult:
pass
class PipeResult:
def apply(self, proc: Callable, *args, **kwargs):
"""Apply callable method which.
Args:
proc (Callable): invoke proc as follows:
proc(pipeline_result, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)
The ``Dataset``, ``InferenceResult``, and ``PipeResult`` classes all have an ``apply`` method, which can be used to chain different stages of the computation.
As shown below, ``MinerU`` provides a set of methods to compose these classes.
.. code:: python
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
Users can implement their own functions for chaining as needed. For example, a user could use the ``apply`` method to create a function that counts the number of pages in a ``pdf`` file.
.. code:: python
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
def count_page(ds) -> int:
return len(ds)
print("page number: ", ds.apply(count_page)) # will output the page count of `abc.pdf`
numpy==1.26.4
click==8.1.7
fast-langdetect==0.2.2
Brotli==1.1.0
boto3>=1.28.43
loguru>=0.6.0
myst-parser
......@@ -9,4 +13,4 @@ sphinx-argparse>=0.5.2
sphinx-book-theme>=1.1.3
sphinx-copybutton>=0.5.2
sphinx_rtd_theme>=3.0.1
autodoc_pydantic>=2.2.0
\ No newline at end of file
autodoc_pydantic>=2.2.0
Convert to Markdown File
========================
Local File Example
^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
## args
model_list = []
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
name_without_suff = pdf_file_name.split(".")[0]
## prepare env
# prepare env
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
......@@ -30,39 +28,43 @@
)
image_dir = str(os.path.basename(local_image_dir))
# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
## inference
infer_result = ds.apply(doc_analyze, ocr=True)
pipe.pipe_classify()
pipe.pipe_analyze()
pipe.pipe_parse()
### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
pdf_info = pipe.pdf_mid_data["pdf_info"]
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
md_content = pipe.pipe_mk_markdown(
image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
)
### draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
if isinstance(md_content, list):
md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
else:
md_writer.write_string(f"{pdf_file_name}.md", md_content)
### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
Object Storage Usage Example
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Object Storage File Example
^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
ak = "{Your S3 access key}" # replace with real s3 access key
......@@ -74,30 +76,39 @@
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
## args
model_list = []
pdf_file_name = f"s3://{bucket_name}/{fake pdf path}" # replace with the real s3 path
# args
pdf_file_name = (
"s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf" # replace with the real s3 path
)
# prepare env
local_dir = "output"
name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
# read bytes
pdf_bytes = reader.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
## inference
infer_result = ds.apply(doc_analyze, ocr=True)
pipe.pipe_classify()
pipe.pipe_analyze()
pipe.pipe_parse()
### draw model result on each page
infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local
pdf_info = pipe.pdf_mid_data["pdf_info"]
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
md_content = pipe.pipe_mk_markdown(
"unittest/tmp/images", drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
)
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local
if isinstance(md_content, list):
writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
else:
writer.write_string(f"{pdf_file_name}.md", md_content)
### draw spans result on each page
pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local
### dump markdown
pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
See :doc:`../data/data_reader_writer` for more **read/write** examples
......@@ -9,3 +9,5 @@
:caption: Tutorial
tutorial/output_file_description
tutorial/pipeline