Commit 7d2dfc80 authored by liukaiwen's avatar liukaiwen
Browse files

Merge branch 'dev' into dev-table-model-update

parents a0eff3be 6d571e2e
Dataset API
------------------
.. autoclass:: magic_pdf.data.dataset.PageableData
:members:
:inherited-members:
.. autoclass:: magic_pdf.data.dataset.Dataset
:members:
:inherited-members:
.. autoclass:: magic_pdf.data.dataset.ImageDataset
:members:
:inherited-members:
.. autoclass:: magic_pdf.data.dataset.PymuDocDataset
:members:
:inherited-members:
.. autoclass:: magic_pdf.data.dataset.Doc
:members:
:inherited-members:
read_api API
------------------
.. automodule:: magic_pdf.data.read_api
:members:
:inherited-members:
...@@ -24,3 +24,15 @@ Welcome to the MinerU Documentation ...@@ -24,3 +24,15 @@ Welcome to the MinerU Documentation
<a class="github-button" href="https://github.com/opendatalab/MinerU/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a> <a class="github-button" href="https://github.com/opendatalab/MinerU/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
<a class="github-button" href="https://github.com/opendatalab/MinerU/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a> <a class="github-button" href="https://github.com/opendatalab/MinerU/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
</p> </p>
API Reference
-------------
If you are looking for information on a specific function, class or
method, this part of the documentation is for you.
.. toctree::
:maxdepth: 2
api
boto3>=1.28.43
loguru>=0.6.0
myst-parser myst-parser
Pillow==8.4.0
pydantic>=2.7.2,<2.8.0
PyMuPDF>=1.24.9
sphinx sphinx
sphinx-argparse sphinx-argparse
sphinx-book-theme sphinx-book-theme
......
...@@ -6,5 +6,4 @@ ...@@ -6,5 +6,4 @@
- [gradio_app](./gradio_app/README.md): Build a web app based on gradio - [gradio_app](./gradio_app/README.md): Build a web app based on gradio
- [web_demo](./web_demo/README.md): MinerU online [demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/) localized deployment version - [web_demo](./web_demo/README.md): MinerU online [demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/) localized deployment version
- [web_api](./web_api/README.md): Web API Based on FastAPI - [web_api](./web_api/README.md): Web API Based on FastAPI
- [multi_gpu](./multi_gpu/README.md): Multi-GPU parallel processing based on LitServe
...@@ -6,4 +6,4 @@ ...@@ -6,4 +6,4 @@
- [gradio_app](./gradio_app/README_zh-CN.md): 基于 Gradio 的 Web 应用 - [gradio_app](./gradio_app/README_zh-CN.md): 基于 Gradio 的 Web 应用
- [web_demo](./web_demo/README_zh-CN.md): MinerU在线[demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/)本地化部署版本 - [web_demo](./web_demo/README_zh-CN.md): MinerU在线[demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/)本地化部署版本
- [web_api](./web_api/README.md): 基于 FastAPI 的 Web API - [web_api](./web_api/README.md): 基于 FastAPI 的 Web API
- [multi_gpu](./multi_gpu/README.md): 基于 LitServe 的多 GPU 并行处理
...@@ -3,10 +3,12 @@ ...@@ -3,10 +3,12 @@
import base64 import base64
import os import os
import time import time
import uuid
import zipfile import zipfile
from pathlib import Path from pathlib import Path
import re import re
import pymupdf
from loguru import logger from loguru import logger
from magic_pdf.libs.hash_utils import compute_sha256 from magic_pdf.libs.hash_utils import compute_sha256
...@@ -23,7 +25,7 @@ def read_fn(path): ...@@ -23,7 +25,7 @@ def read_fn(path):
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN) return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr): def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
try: try:
...@@ -42,6 +44,10 @@ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr): ...@@ -42,6 +44,10 @@ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr):
parse_method, parse_method,
False, False,
end_page_id=end_page_id, end_page_id=end_page_id,
layout_model=layout_mode,
formula_enable=formula_enable,
table_enable=table_enable,
lang=language,
) )
return local_md_dir, file_name return local_md_dir, file_name
except Exception as e: except Exception as e:
...@@ -93,9 +99,10 @@ def replace_image_with_base64(markdown_text, image_dir_path): ...@@ -93,9 +99,10 @@ def replace_image_with_base64(markdown_text, image_dir_path):
return re.sub(pattern, replace, markdown_text) return re.sub(pattern, replace, markdown_text)
def to_markdown(file_path, end_pages, is_ocr): def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
# 获取识别的md文件以及压缩包文件路径 # 获取识别的md文件以及压缩包文件路径
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr) local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
layout_mode, formula_enable, table_enable, language)
archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip") archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path) zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
if zip_archive_success == 0: if zip_archive_success == 0:
...@@ -138,24 +145,71 @@ with open("header.html", "r") as file: ...@@ -138,24 +145,71 @@ with open("header.html", "r") as file:
header = file.read() header = file.read()
# Language codes offered in the "Language" dropdown, grouped into one list
# per family (grouping presumably follows the OCR backend's script families;
# verify against the OCR model documentation).
latin_lang = [
    'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
    'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
    'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
    'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german',
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
    'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
    'dar', 'inh', 'che', 'lbe', 'lez', 'tab',
]
devanagari_lang = [
    'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
    'sa', 'bgc',
]
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']

# Flat list of every selectable value. The leading empty string is the
# "no language chosen" entry; the remaining codes keep the order
# other -> latin -> arabic -> cyrillic -> devanagari.
all_lang = [
    "",
    *other_lang,
    *latin_lang,
    *arabic_lang,
    *cyrillic_lang,
    *devanagari_lang,
]
def to_pdf(file_path):
    """Return a path to a PDF version of ``file_path``.

    The file is opened with ``pymupdf``. If the opened document reports
    ``is_pdf``, the original path is returned unchanged. Otherwise the
    document is converted with ``convert_to_pdf()`` and the result is written
    to a new ``<uuid4>.pdf`` file in the same directory as the input, whose
    path is returned.

    Args:
        file_path: Path of the uploaded file (a PDF or, presumably, an image
            that pymupdf can open -- confirm against the caller).

    Returns:
        ``file_path`` itself for PDFs, otherwise the path of the newly
        written PDF file.
    """
    with pymupdf.open(file_path) as doc:
        # Already a PDF: nothing to convert.
        if doc.is_pdf:
            return file_path

        converted = doc.convert_to_pdf()

        # Write the converted bytes to a uniquely named file next to the
        # input, so that repeated uploads do not overwrite one another.
        target_dir = os.path.dirname(file_path)
        tmp_file_path = os.path.join(target_dir, f"{uuid.uuid4()}.pdf")
        with open(tmp_file_path, 'wb') as out:
            out.write(converted)

        return tmp_file_path
if __name__ == "__main__": if __name__ == "__main__":
with gr.Blocks() as demo: with gr.Blocks() as demo:
gr.HTML(header) gr.HTML(header)
with gr.Row(): with gr.Row():
with gr.Column(variant='panel', scale=5): with gr.Column(variant='panel', scale=5):
pdf_show = gr.Markdown() file = gr.File(label="Please upload a PDF or image", file_types=[".pdf", ".png", ".jpeg", ".jpg"])
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages") max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
with gr.Row() as bu_flow: with gr.Row():
is_ocr = gr.Checkbox(label="Force enable OCR") layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="Layout model", value="layoutlmv3")
language = gr.Dropdown(all_lang, label="Language", value="")
with gr.Row():
formula_enable = gr.Checkbox(label="Enable formula recognition", value=True)
is_ocr = gr.Checkbox(label="Force enable OCR", value=False)
table_enable = gr.Checkbox(label="Enable table recognition(test)", value=False)
with gr.Row():
change_bu = gr.Button("Convert") change_bu = gr.Button("Convert")
clear_bu = gr.ClearButton([pdf_show], value="Clear") clear_bu = gr.ClearButton(value="Clear")
pdf_show = PDF(label="Please upload pdf", interactive=True, height=800) pdf_show = PDF(label="PDF preview", interactive=True, height=800)
with gr.Accordion("Examples:"): with gr.Accordion("Examples:"):
example_root = os.path.join(os.path.dirname(__file__), "examples") example_root = os.path.join(os.path.dirname(__file__), "examples")
gr.Examples( gr.Examples(
examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
_.endswith("pdf")], _.endswith("pdf")],
inputs=pdf_show, inputs=pdf_show
) )
with gr.Column(variant='panel', scale=5): with gr.Column(variant='panel', scale=5):
...@@ -166,7 +220,9 @@ if __name__ == "__main__": ...@@ -166,7 +220,9 @@ if __name__ == "__main__":
latex_delimiters=latex_delimiters, line_breaks=True) latex_delimiters=latex_delimiters, line_breaks=True)
with gr.Tab("Markdown text"): with gr.Tab("Markdown text"):
md_text = gr.TextArea(lines=45, show_copy_button=True) md_text = gr.TextArea(lines=45, show_copy_button=True)
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr], outputs=[md, md_text, output_file, pdf_show]) file.upload(fn=to_pdf, inputs=file, outputs=pdf_show)
clear_bu.add([md, pdf_show, md_text, output_file, is_ocr]) change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
outputs=[md, md_text, output_file, pdf_show])
clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr, table_enable, language])
demo.launch() demo.launch(server_name="0.0.0.0")
\ No newline at end of file \ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment