Unverified Commit d0a3058b authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1280 from icecraft/docs/tune_docs

Docs/tune docs
parents 91aa7e56 f49d261d
import copy
import json import json
import os import os
from typing import Callable from typing import Callable
import copy
from magic_pdf.config.make_content_config import DropMode, MakeMode from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter from magic_pdf.data.data_reader_writer import DataWriter
...@@ -23,6 +23,26 @@ class PipeResult: ...@@ -23,6 +23,26 @@ class PipeResult:
self._pipe_res = pipe_res self._pipe_res = pipe_res
self._dataset = dataset self._dataset = dataset
def get_markdown(self,
img_dir_or_bucket_prefix: str,
drop_mode=DropMode.WHOLE_PDF,
md_make_mode=MakeMode.MM_MD) -> str:
"""Get markdown content.
Args:
img_dir_or_bucket_prefix (str): The S3 bucket prefix or local directory used to store the figures
drop_mode (str, optional): Drop strategy applied when a page is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
md_make_mode (str, optional): The type of Markdown content to produce. Defaults to MakeMode.MM_MD.
Returns:
str: return markdown content
"""
pdf_info_list = self._pipe_res['pdf_info']
md_content = union_make(
pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix
)
return md_content
def dump_md( def dump_md(
self, self,
writer: DataWriter, writer: DataWriter,
...@@ -40,14 +60,40 @@ class PipeResult: ...@@ -40,14 +60,40 @@ class PipeResult:
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
""" """
md_content = self.get_markdown(img_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode)
writer.write_string(file_path, md_content)
def get_content_list(self,
image_dir_or_bucket_prefix: str,
drop_mode=DropMode.NONE,
md_make_mode=MakeMode.STANDARD_FORMAT) -> str:
"""Get Content List.
Args:
image_dir_or_bucket_prefix (str): The S3 bucket prefix or local directory used to store the figures
drop_mode (str, optional): Drop strategy applied when a page is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The type of content to produce. Defaults to MakeMode.STANDARD_FORMAT.
Returns:
str: content list content
"""
pdf_info_list = self._pipe_res['pdf_info'] pdf_info_list = self._pipe_res['pdf_info']
md_content = union_make( content_list = union_make(
pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix pdf_info_list,
md_make_mode,
drop_mode,
image_dir_or_bucket_prefix,
) )
writer.write_string(file_path, md_content) return content_list
def dump_content_list( def dump_content_list(
self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str self,
writer: DataWriter,
file_path: str,
image_dir_or_bucket_prefix: str,
drop_mode=DropMode.NONE,
md_make_mode=MakeMode.STANDARD_FORMAT
): ):
"""Dump Content List. """Dump Content List.
...@@ -55,14 +101,10 @@ class PipeResult: ...@@ -55,14 +101,10 @@ class PipeResult:
writer (DataWriter): File writer handle writer (DataWriter): File writer handle
file_path (str): The file location of content list file_path (str): The file location of content list
image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
drop_mode (str, optional): Drop strategy applied when a page is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The type of content to produce. Defaults to MakeMode.STANDARD_FORMAT.
""" """
pdf_info_list = self._pipe_res['pdf_info'] content_list = self.get_content_list(image_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode)
content_list = union_make(
pdf_info_list,
MakeMode.STANDARD_FORMAT,
DropMode.NONE,
image_dir_or_bucket_prefix,
)
writer.write_string( writer.write_string(
file_path, json.dumps(content_list, ensure_ascii=False, indent=4) file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
) )
...@@ -123,7 +165,7 @@ class PipeResult: ...@@ -123,7 +165,7 @@ class PipeResult:
Returns: Returns:
str: compress the pipeline result and return str: compress the pipeline result and return
""" """
return JsonCompressor.compress_json(self.pdf_mid_data) return JsonCompressor.compress_json(self._pipe_res)
def apply(self, proc: Callable, *args, **kwargs): def apply(self, proc: Callable, *args, **kwargs):
"""Apply callable method which. """Apply callable method which.
......
This diff is collapsed.
This diff is collapsed.
...@@ -83,7 +83,24 @@ Read images from path or directory ...@@ -83,7 +83,24 @@ Read images from path or directory
datasets = read_local_images("tt.png") # replace with real file path datasets = read_local_images("tt.png") # replace with real file path
# read files from directory that endswith suffix in suffixes array # read files from directory that endswith suffix in suffixes array
datasets = read_local_images("images/", suffixes=["png", "jpg"]) # replace with real directory datasets = read_local_images("images/", suffixes=[".png", ".jpg"]) # replace with real directory
read_local_office
^^^^^^^^^^^^^^^^^^^^
Read MS-Office files from path or directory
.. code:: python
from magic_pdf.data.read_api import *
# read from MS-Office file path
datasets = read_local_office("tt.doc") # replace with real file path
# read MS-Office files from a directory
datasets = read_local_office("docs/") # replace with real directory
Check :doc:`../../api/read_api` for more details Check :doc:`../../api/read_api` for more details
\ No newline at end of file
...@@ -5,6 +5,14 @@ Config ...@@ -5,6 +5,14 @@ Config
File **magic-pdf.json** is typically located in the **${HOME}** directory under a Linux system or in the **C:\Users\{username}** directory under a Windows system. File **magic-pdf.json** is typically located in the **${HOME}** directory under a Linux system or in the **C:\Users\{username}** directory under a Windows system.
.. admonition:: Tip
:class: tip
You can override the default location of the config file with the following command:
export MINERU_TOOLS_CONFIG_JSON=new_magic_pdf.json
magic-pdf.json magic-pdf.json
---------------- ----------------
......
...@@ -7,7 +7,7 @@ Convert Doc ...@@ -7,7 +7,7 @@ Convert Doc
:class: tip :class: tip
When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF. When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output. For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
...@@ -15,15 +15,15 @@ Convert Doc ...@@ -15,15 +15,15 @@ Convert Doc
Command Line Command Line
^^^^^^^^^^^^^ ^^^^^^^^^^^^^
.. code:: python .. code:: python
# make sure the file have correct suffix # make sure the file have correct suffix
magic-pdf -p a.doc -o output -m auto magic-pdf -p a.doc -o output -m auto
API API
^^^^^^^^ ^^^^^^^^
.. code:: python .. code:: python
import os import os
...@@ -44,13 +44,16 @@ API ...@@ -44,13 +44,16 @@ API
# proc # proc
## Create Dataset Instance ## Create Dataset Instance
input_file = "some_doc.doc" # replace with real ms-office file input_file = "some_doc.doc" # replace with real ms-office file
input_file_name = input_file.split(".")[0] input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0] ds = read_local_office(input_file)[0]
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md( ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir md_writer, f"{input_file_name}.md", image_dir
) )
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
...@@ -6,23 +6,23 @@ Convert DocX ...@@ -6,23 +6,23 @@ Convert DocX
:class: tip :class: tip
When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF. When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output. For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line Command Line
^^^^^^^^^^^^^ ^^^^^^^^^^^^^
.. code:: python .. code:: python
# make sure the file have correct suffix # make sure the file have correct suffix
magic-pdf -p a.docx -o output -m auto magic-pdf -p a.docx -o output -m auto
API API
^^^^^ ^^^^^
.. code:: python .. code:: python
import os import os
...@@ -43,11 +43,16 @@ API ...@@ -43,11 +43,16 @@ API
# proc # proc
## Create Dataset Instance ## Create Dataset Instance
input_file = "some_docx.docx" # replace with real ms-office file input_file = "some_docx.docx" # replace with real ms-office file
input_file_name = input_file.split(".")[0] input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0] ds = read_local_office(input_file)[0]
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md( ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir md_writer, f"{input_file_name}.md", image_dir
) )
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
...@@ -7,13 +7,13 @@ Convert Image ...@@ -7,13 +7,13 @@ Convert Image
Command Line Command Line
^^^^^^^^^^^^^ ^^^^^^^^^^^^^
.. code:: python .. code:: python
# make sure the file have correct suffix # make sure the file have correct suffix
magic-pdf -p a.png -o output -m auto magic-pdf -p a.png -o output -m auto
API API
^^^^^^ ^^^^^^
.. code:: python .. code:: python
...@@ -41,6 +41,12 @@ API ...@@ -41,6 +41,12 @@ API
input_file_name = input_file.split(".")[0] input_file_name = input_file.split(".")[0]
ds = read_local_images(input_file)[0] ds = read_local_images(input_file)[0]
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md( ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir md_writer, f"{input_file_name}.md", image_dir
) )
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
Convert PDF Convert PDF
============ ============
Command Line Command Line
^^^^^^^^^^^^^ ^^^^^^^^^^^^^
.. code:: python .. code:: python
# make sure the file have correct suffix # make sure the file have correct suffix
magic-pdf -p a.pdf -o output -m auto magic-pdf -p a.pdf -o output -m auto
...@@ -44,6 +44,12 @@ API ...@@ -44,6 +44,12 @@ API
## Create Dataset Instance ## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes) ds = PymuDocDataset(pdf_bytes)
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir) # ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{name_without_suff}.md", image_dir
)
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{name_without_suff}.md", image_dir
)
Convert PPT Convert PPT
============ ============
.. admonition:: Warning .. admonition:: Warning
:class: tip :class: tip
When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF. When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output. For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line Command Line
^^^^^^^^^^^^^ ^^^^^^^^^^^^^
.. code:: python .. code:: python
# make sure the file have correct suffix # make sure the file have correct suffix
magic-pdf -p a.ppt -o output -m auto magic-pdf -p a.ppt -o output -m auto
API API
^^^^^ ^^^^^
.. code:: python .. code:: python
import os import os
...@@ -43,10 +43,16 @@ API ...@@ -43,10 +43,16 @@ API
# proc # proc
## Create Dataset Instance ## Create Dataset Instance
input_file = "some_ppt.ppt" # replace with real ms-office file input_file = "some_ppt.ppt" # replace with real ms-office file
input_file_name = input_file.split(".")[0] input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0] ds = read_local_office(input_file)[0]
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md( ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir md_writer, f"{input_file_name}.md", image_dir
) )
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
...@@ -7,14 +7,14 @@ Convert PPTX ...@@ -7,14 +7,14 @@ Convert PPTX
:class: tip :class: tip
When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF. When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output. For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
Command Line Command Line
^^^^^^^^^^^^^ ^^^^^^^^^^^^^
.. code:: python .. code:: python
# make sure the file have correct suffix # make sure the file have correct suffix
magic-pdf -p a.pptx -o output -m auto magic-pdf -p a.pptx -o output -m auto
...@@ -22,10 +22,10 @@ Command Line ...@@ -22,10 +22,10 @@ Command Line
API API
^^^^^^ ^^^^^^
.. code:: python .. code:: python
import os import os
...@@ -46,10 +46,16 @@ API ...@@ -46,10 +46,16 @@ API
# proc # proc
## Create Dataset Instance ## Create Dataset Instance
input_file = "some_pptx.pptx" # replace with real ms-office file input_file = "some_pptx.pptx" # replace with real ms-office file
input_file_name = input_file.split(".")[0] input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0] ds = read_local_office(input_file)[0]
# ocr mode
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md( ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir md_writer, f"{input_file_name}.md", image_dir
) )
# txt mode
ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir
)
Api Usage Api Usage
=========== ===========
...@@ -16,6 +16,7 @@ Local File Example ...@@ -16,6 +16,7 @@ Local File Example
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# args # args
pdf_file_name = "abc.pdf" # replace with the real pdf path pdf_file_name = "abc.pdf" # replace with the real pdf path
...@@ -40,15 +41,22 @@ Local File Example ...@@ -40,15 +41,22 @@ Local File Example
## Create Dataset Instance ## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes) ds = PymuDocDataset(pdf_bytes)
## inference ## inference
infer_result = ds.apply(doc_analyze, ocr=True) if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
### draw model result on each page ### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")) infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
### draw layout result on each page ### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")) pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
...@@ -58,6 +66,9 @@ Local File Example ...@@ -58,6 +66,9 @@ Local File Example
### dump markdown ### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir) pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
S3 File Example S3 File Example
^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^
...@@ -96,30 +107,39 @@ S3 File Example ...@@ -96,30 +107,39 @@ S3 File Example
## Create Dataset Instance ## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes) ds = PymuDocDataset(pdf_bytes)
## inference ## inference
infer_result = ds.apply(doc_analyze, ocr=True) if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
### draw model result on each page ## pipeline
infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local pipe_result = infer_result.pipe_ocr_mode(image_writer)
## pipeline else:
pipe_result = infer_result.pipe_ocr_mode(image_writer) infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
### draw layout result on each page ### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
### draw spans result on each page ### draw spans result on each page
pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
### dump markdown ### dump markdown
pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3 pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
MS-Office MS-Office
---------- ----------
.. code:: python .. code:: python
import os import os
...@@ -144,7 +164,7 @@ MS-Office ...@@ -144,7 +164,7 @@ MS-Office
input_file_name = input_file.split(".")[0] input_file_name = input_file.split(".")[0]
ds = read_local_office(input_file)[0] ds = read_local_office(input_file)[0]
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md( ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
md_writer, f"{input_file_name}.md", image_dir md_writer, f"{input_file_name}.md", image_dir
) )
...@@ -154,7 +174,7 @@ This code snippet can be used to manipulate **ppt**, **pptx**, **doc**, **docx** ...@@ -154,7 +174,7 @@ This code snippet can be used to manipulate **ppt**, **pptx**, **doc**, **docx**
Image Image
--------- ---------
Single Image File Single Image File
^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^
.. code:: python .. code:: python
...@@ -187,7 +207,7 @@ Single Image File ...@@ -187,7 +207,7 @@ Single Image File
) )
Directory That Contains Images Directory That Contains Images
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python .. code:: python
...@@ -213,7 +233,7 @@ Directory That Contains Images ...@@ -213,7 +233,7 @@ Directory That Contains Images
input_directory = "some_image_dir/" # replace with real directory that contains images input_directory = "some_image_dir/" # replace with real directory that contains images
dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])[0] dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])
count = 0 count = 0
for ds in dss: for ds in dss:
......
...@@ -8,6 +8,7 @@ myst-parser ...@@ -8,6 +8,7 @@ myst-parser
Pillow==8.4.0 Pillow==8.4.0
pydantic>=2.7.2,<2.8.0 pydantic>=2.7.2,<2.8.0
PyMuPDF>=1.24.9 PyMuPDF>=1.24.9
pdfminer.six==20231228
sphinx sphinx
sphinx-argparse>=0.5.2 sphinx-argparse>=0.5.2
sphinx-book-theme>=1.1.3 sphinx-book-theme>=1.1.3
......
...@@ -9,10 +9,11 @@ from fastapi.responses import JSONResponse ...@@ -9,10 +9,11 @@ from fastapi.responses import JSONResponse
from loguru import logger from loguru import logger
import magic_pdf.model as model_config import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import FileBasedDataWriter from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.pipe.OCRPipe import OCRPipe from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.pipe.TXTPipe import TXTPipe from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pipe.UNIPipe import UNIPipe from magic_pdf.model.operators import InferenceResult
model_config.__use_inside_model__ = True model_config.__use_inside_model__ = True
...@@ -20,14 +21,15 @@ app = FastAPI() ...@@ -20,14 +21,15 @@ app = FastAPI()
def json_md_dump( def json_md_dump(
pipe, model_json,
middle_json,
md_writer, md_writer,
pdf_name, pdf_name,
content_list, content_list,
md_content, md_content,
): ):
# Write model results to model.json # Write model results to model.json
orig_model_list = copy.deepcopy(pipe.model_list) orig_model_list = copy.deepcopy(model_json)
md_writer.write_string( md_writer.write_string(
f'{pdf_name}_model.json', f'{pdf_name}_model.json',
json.dumps(orig_model_list, ensure_ascii=False, indent=4), json.dumps(orig_model_list, ensure_ascii=False, indent=4),
...@@ -36,7 +38,7 @@ def json_md_dump( ...@@ -36,7 +38,7 @@ def json_md_dump(
# Write intermediate results to middle.json # Write intermediate results to middle.json
md_writer.write_string( md_writer.write_string(
f'{pdf_name}_middle.json', f'{pdf_name}_middle.json',
json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4), json.dumps(middle_json, ensure_ascii=False, indent=4),
) )
# Write text content results to content_list.json # Write text content results to content_list.json
...@@ -100,45 +102,49 @@ async def pdf_parse_main( ...@@ -100,45 +102,49 @@ async def pdf_parse_main(
output_image_path output_image_path
), FileBasedDataWriter(output_path) ), FileBasedDataWriter(output_path)
ds = PymuDocDataset(pdf_bytes)
# Choose parsing method # Choose parsing method
if parse_method == 'auto': if parse_method == 'auto':
jso_useful_key = {'_pdf_type': '', 'model_list': model_json} if ds.classify() == SupportedPdfParseMethod.OCR:
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer) parse_method = 'ocr'
elif parse_method == 'txt': else:
pipe = TXTPipe(pdf_bytes, model_json, image_writer) parse_method = 'txt'
elif parse_method == 'ocr':
pipe = OCRPipe(pdf_bytes, model_json, image_writer) if parse_method not in ['txt', 'ocr']:
else:
logger.error('Unknown parse method, only auto, ocr, txt allowed') logger.error('Unknown parse method, only auto, ocr, txt allowed')
return JSONResponse( return JSONResponse(
content={'error': 'Invalid parse method'}, status_code=400 content={'error': 'Invalid parse method'}, status_code=400
) )
# Execute classification if len(model_json) == 0:
pipe.pipe_classify() if parse_method == 'ocr':
infer_result = ds.apply(doc_analyze, ocr=True)
# If no model data is provided, use built-in model for parsing
if not model_json:
if model_config.__use_inside_model__:
pipe.pipe_analyze() # Parse
else: else:
infer_result = ds.apply(doc_analyze, ocr=False)
else:
infer_result = InferenceResult(model_json, ds)
if len(model_json) == 0 and not model_config.__use_inside_model__:
logger.error('Need model list input') logger.error('Need model list input')
return JSONResponse( return JSONResponse(
content={'error': 'Model list input required'}, status_code=400 content={'error': 'Model list input required'}, status_code=400
) )
if parse_method == 'ocr':
pipe_res = infer_result.pipe_ocr_mode(image_writer)
else:
pipe_res = infer_result.pipe_txt_mode(image_writer)
# Execute parsing
pipe.pipe_parse()
# Save results in text and md format # Save results in text and md format
content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode='none') content_list = pipe_res.get_content_list(image_path_parent, drop_mode='none')
md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode='none') md_content = pipe_res.get_markdown(image_path_parent, drop_mode='none')
if is_json_md_dump: if is_json_md_dump:
json_md_dump(pipe, md_writer, pdf_name, content_list, md_content) json_md_dump(infer_result._infer_res, pipe_res._pipe_res, md_writer, pdf_name, content_list, md_content)
data = { data = {
'layout': copy.deepcopy(pipe.model_list), 'layout': copy.deepcopy(infer_result._infer_res),
'info': pipe.pdf_mid_data, 'info': pipe_res._pipe_res,
'content_list': content_list, 'content_list': content_list,
'md_content': md_content, 'md_content': md_content,
} }
......
...@@ -11,9 +11,12 @@ from flask import current_app, url_for ...@@ -11,9 +11,12 @@ from flask import current_app, url_for
from loguru import logger from loguru import logger
import magic_pdf.model as model_config import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import FileBasedDataWriter from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.libs.json_compressor import JsonCompressor from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.UNIPipe import UNIPipe from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.model.operators import InferenceResult
from ..extentions import app, db from ..extentions import app, db
from .ext import find_file from .ext import find_file
...@@ -25,25 +28,28 @@ model_config.__use_inside_model__ = True ...@@ -25,25 +28,28 @@ model_config.__use_inside_model__ = True
def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False): def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False):
try: try:
model_json = [] # model_json传空list使用内置模型解析 model_json = [] # model_json传空list使用内置模型解析
image_writer = FileBasedDataWriter(image_dir)
logger.info(f'is_ocr: {is_ocr}') logger.info(f'is_ocr: {is_ocr}')
parse_method = 'ocr'
ds = PymuDocDataset(pdf_bytes)
# Choose parsing method
if not is_ocr: if not is_ocr:
jso_useful_key = {'_pdf_type': '', 'model_list': model_json} if ds.classify() == SupportedPdfParseMethod.OCR:
image_writer = FileBasedDataWriter(image_dir) parse_method = 'ocr'
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
pipe.pipe_classify()
else:
jso_useful_key = {'_pdf_type': 'ocr', 'model_list': model_json}
image_writer = FileBasedDataWriter(image_dir)
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
"""如果没有传入有效的模型数据,则使用内置model解析"""
if len(model_json) == 0:
if model_config.__use_inside_model__:
pipe.pipe_analyze()
else: else:
logger.error('need model list input') parse_method = 'txt'
exit(1)
pipe.pipe_parse() if parse_method == 'ocr':
pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data()) infer_result = ds.apply(doc_analyze, ocr=True)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
if parse_method == 'ocr':
pipe_res = infer_result.pipe_ocr_mode(image_writer)
else:
pipe_res = infer_result.pipe_txt_mode(image_writer)
pdf_mid_data = pipe_res._pipe_res
pdf_info_list = pdf_mid_data['pdf_info'] pdf_info_list = pdf_mid_data['pdf_info']
md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_url_prefix), md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_url_prefix),
ensure_ascii=False) ensure_ascii=False)
...@@ -52,7 +58,6 @@ def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False): ...@@ -52,7 +58,6 @@ def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False):
except Exception as e: # noqa: F841 except Exception as e: # noqa: F841
logger.error(traceback.format_exc()) logger.error(traceback.format_exc())
def get_bbox_info(data): def get_bbox_info(data):
bbox_info = [] bbox_info = []
for page in data: for page in data:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment