Unverified Commit 9d689790 authored by linfeng, committed by GitHub

Merge branch 'opendatalab:dev' into dev

parents bcef0868 fb383ba6
@@ -17,7 +17,7 @@ class AbsPipe(ABC):
     PIP_TXT = "txt"

     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None):
+                 start_page_id=0, end_page_id=None, lang=None):
         self.pdf_bytes = pdf_bytes
         self.model_list = model_list
         self.image_writer = image_writer
@@ -25,6 +25,7 @@ class AbsPipe(ABC):
         self.is_debug = is_debug
         self.start_page_id = start_page_id
         self.end_page_id = end_page_id
+        self.lang = lang

     def get_compress_pdf_mid_data(self):
         return JsonCompressor.compress_json(self.pdf_mid_data)
@@ -94,7 +95,9 @@ class AbsPipe(ABC):
         """
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
         pdf_info_list = pdf_mid_data["pdf_info"]
-        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
+        parse_type = pdf_mid_data["_parse_type"]
+        lang = pdf_mid_data.get("_lang", None)
+        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path, parse_type, lang)
         return content_list

     @staticmethod
@@ -104,7 +107,9 @@ class AbsPipe(ABC):
         """
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
         pdf_info_list = pdf_mid_data["pdf_info"]
-        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
+        parse_type = pdf_mid_data["_parse_type"]
+        lang = pdf_mid_data.get("_lang", None)
+        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path, parse_type, lang)
         return md_content
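The two static make helpers above recover `_parse_type` and `_lang` from the decompressed mid-data instead of taking new parameters. A minimal sketch of that round trip, with the compressor reduced to plain JSON and the make step reduced to reading the fields (everything here except the dict keys is an assumption):

```python
# Sketch of the _parse_type/_lang round trip; json stands in for
# JsonCompressor, and union_make is omitted entirely.
import json

# The parse step tags the mid-data once...
pdf_mid_data = {"pdf_info": [], "_parse_type": "ocr"}
lang = "en"
if lang is not None:
    pdf_mid_data["_lang"] = lang
blob = json.dumps(pdf_mid_data)     # stand-in for JsonCompressor.compress_json

# ...and any later make step recovers both fields with no extra arguments.
restored = json.loads(blob)         # stand-in for JsonCompressor.decompress_json
parse_type = restored["_parse_type"]
lang = restored.get("_lang", None)  # None when no language hint was supplied
print(parse_type, lang)             # -> ocr en
```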
@@ -10,19 +10,21 @@ from magic_pdf.user_api import parse_ocr_pdf
 class OCRPipe(AbsPipe):

     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
+                 start_page_id=0, end_page_id=None, lang=None):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)

     def pipe_classify(self):
         pass

     def pipe_analyze(self):
         self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
-                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                      lang=self.lang)

     def pipe_parse(self):
         self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                          lang=self.lang)

     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
@@ -11,19 +11,21 @@ from magic_pdf.user_api import parse_txt_pdf
 class TXTPipe(AbsPipe):

     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
+                 start_page_id=0, end_page_id=None, lang=None):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)

     def pipe_classify(self):
         pass

     def pipe_analyze(self):
         self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
-                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                      lang=self.lang)

     def pipe_parse(self):
         self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                          lang=self.lang)

     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
@@ -14,9 +14,9 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 class UNIPipe(AbsPipe):

     def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None):
+                 start_page_id=0, end_page_id=None, lang=None):
         self.pdf_type = jso_useful_key["_pdf_type"]
-        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id)
+        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id, lang)
         if len(self.model_list) == 0:
             self.input_model_is_empty = True
         else:
@@ -28,22 +28,26 @@ class UNIPipe(AbsPipe):
     def pipe_analyze(self):
         if self.pdf_type == self.PIP_TXT:
             self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                          lang=self.lang)
         elif self.pdf_type == self.PIP_OCR:
             self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                          lang=self.lang)

     def pipe_parse(self):
         if self.pdf_type == self.PIP_TXT:
             self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                                 is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
-                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                                lang=self.lang)
         elif self.pdf_type == self.PIP_OCR:
             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                               is_debug=self.is_debug,
-                                              start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                              start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                              lang=self.lang)

-    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
+    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
         logger.info("uni_pipe mk content list finished")
         return result
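For orientation, a hedged driver for the lang-aware pipes; the file paths, writer target, and empty model list are illustrative assumptions, not part of this diff. OCRPipe and TXTPipe accept `lang` the same way:

```python
# Illustrative use of UNIPipe with the new lang hint; paths are assumed.
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter

with open("demo.pdf", "rb") as f:
    pdf_bytes = f.read()
image_writer = DiskReaderWriter("./output/images")
jso_useful_key = {"_pdf_type": "", "model_list": []}  # empty: pipe_analyze fills it

pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, lang="en")
pipe.pipe_classify()   # decide txt vs ocr
pipe.pipe_analyze()    # runs doc_analyze(..., lang=self.lang)
pipe.pipe_parse()      # pdf_mid_data gains "_lang" when lang is set
md = pipe.pipe_mk_markdown("./output/images")
```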
@@ -2,13 +2,13 @@ model:
   arch: unimernet
   model_type: unimernet
   model_config:
-    model_name: ./models
-    max_seq_len: 1024
-    length_aware: False
+    model_name: ./models/unimernet_base
+    max_seq_len: 1536
     load_pretrained: True
-    pretrained: ./models/pytorch_model.bin
+    pretrained: './models/unimernet_base/pytorch_model.pth'
   tokenizer_config:
-    path: ./models
+    path: ./models/unimernet_base

 datasets:
   formula_rec_eval:
@@ -18,7 +18,7 @@ datasets:
     image_size:
       - 192
       - 672

 run:
   runner: runner_iter
   task: unimernet_train
@@ -43,4 +43,4 @@ run:
   distributed_type: ddp # or fsdp when train llm

   generate_cfg:
-    temperature: 0.0
+    temperature: 0.0
\ No newline at end of file
@@ -10,6 +10,6 @@ config:
 weights:
   layout: Layout/model_final.pth
   mfd: MFD/weights.pt
-  mfr: MFR/UniMERNet
+  mfr: MFR/unimernet_base
   struct_eqtable: TabRec/StructEqTable
   TableMaster: TabRec/TableMaster
\ No newline at end of file
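Both config changes point at the renamed `unimernet_base` directory, so the weights map and the UniMERNet YAML above have to agree on disk. A hedged consistency check; the models root and the weight file name are assumptions based on the snippets above:

```python
# Hypothetical sanity check that the MFR entry in model_configs.yaml
# resolves to the unimernet_base weights the demo YAML expects.
import os
import yaml  # pip install pyyaml

models_root = "/opt/models"  # assumed download location
with open("model_configs.yaml") as f:
    cfg = yaml.safe_load(f)

mfr_dir = os.path.join(models_root, cfg["weights"]["mfr"])  # .../MFR/unimernet_base
assert os.path.isfile(os.path.join(mfr_dir, "pytorch_model.pth")), mfr_dir
```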
@@ -44,6 +44,18 @@ auto: automatically choose the best method for parsing pdf from ocr and txt.
 without method specified, auto will be used by default.""",
     default='auto',
 )
+@click.option(
+    '-l',
+    '--lang',
+    'lang',
+    type=str,
+    help="""
+    Specify the language of the PDF (if known) to improve OCR accuracy. Optional.
+    Use a language abbreviation from:
+    https://paddlepaddle.github.io/PaddleOCR/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
+    """,
+    default=None,
+)
 @click.option(
     '-d',
     '--debug',
@@ -68,7 +80,7 @@ without method specified, auto will be used by default.""",
     help='The ending page for PDF parsing, beginning from 0.',
     default=None,
 )
-def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
+def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
     model_config.__use_inside_model__ = True
     model_config.__model_mode__ = 'full'
     os.makedirs(output_dir, exist_ok=True)
@@ -90,6 +102,7 @@ def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
             debug_able,
             start_page_id=start_page_id,
             end_page_id=end_page_id,
+            lang=lang
         )
     except Exception as e:
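A quick way to exercise the new flag end to end is click's test runner. In this sketch the `-p`/`-o`/`-m` option names and the `magic_pdf.tools.cli` import path are assumptions based on the surrounding code (only `-l/--lang` appears in the diff), and `en` is one of the PaddleOCR abbreviations the help text links to:

```python
# Hedged smoke test for the new --lang option.
from click.testing import CliRunner

from magic_pdf.tools.cli import cli  # assumed module path

runner = CliRunner()
result = runner.invoke(cli, ["-p", "demo.pdf", "-o", "./output", "-m", "auto", "-l", "en"])
print(result.exit_code, result.output)
```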
@@ -44,9 +44,10 @@ def do_parse(
     f_draw_model_bbox=False,
     start_page_id=0,
     end_page_id=None,
+    lang=None,
 ):
     if debug_able:
-        logger.warning("debug mode is on")
+        logger.warning('debug mode is on')
         f_dump_content_list = True
         f_draw_model_bbox = True
@@ -61,13 +62,13 @@ def do_parse(
     if parse_method == 'auto':
         jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
         pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
-                       start_page_id=start_page_id, end_page_id=end_page_id)
+                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang)
     elif parse_method == 'txt':
         pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
-                       start_page_id=start_page_id, end_page_id=end_page_id)
+                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang)
     elif parse_method == 'ocr':
         pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
-                       start_page_id=start_page_id, end_page_id=end_page_id)
+                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang)
     else:
         logger.error('unknown parse method')
         exit(1)
@@ -26,7 +26,7 @@ PARSE_TYPE_OCR = "ocr"

 def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
-                  start_page_id=0, end_page_id=None,
+                  start_page_id=0, end_page_id=None, lang=None,
                   *args, **kwargs):
     """
     Parse a text-based PDF.
@@ -44,11 +44,14 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
     pdf_info_dict["_version_name"] = __version__

+    if lang is not None:
+        pdf_info_dict["_lang"] = lang
+
     return pdf_info_dict


 def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
-                  start_page_id=0, end_page_id=None,
+                  start_page_id=0, end_page_id=None, lang=None,
                   *args, **kwargs):
     """
     Parse an OCR-based PDF.
@@ -66,12 +69,15 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
     pdf_info_dict["_version_name"] = __version__

+    if lang is not None:
+        pdf_info_dict["_lang"] = lang
+
     return pdf_info_dict


 def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
                     input_model_is_empty: bool = False,
-                    start_page_id=0, end_page_id=None,
+                    start_page_id=0, end_page_id=None, lang=None,
                     *args, **kwargs):
     """
     Parse a PDF that mixes text and OCR content, extracting everything.
@@ -95,9 +101,11 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
     if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
         logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
         if input_model_is_empty:
-            pdf_models = doc_analyze(pdf_bytes, ocr=True,
+            pdf_models = doc_analyze(pdf_bytes,
+                                     ocr=True,
                                      start_page_id=start_page_id,
-                                     end_page_id=end_page_id)
+                                     end_page_id=end_page_id,
+                                     lang=lang)
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
         if pdf_info_dict is None:
             raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
@@ -108,4 +116,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
     pdf_info_dict["_version_name"] = __version__

+    if lang is not None:
+        pdf_info_dict["_lang"] = lang
+
     return pdf_info_dict
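The same hint is available one level down in user_api. A minimal hedged call; the sample file name is a placeholder and the empty model list stands in for output that would normally come from a prior doc_analyze run:

```python
# Placeholder-level sketch of calling the lang-aware user API directly;
# "japan" is PaddleOCR's abbreviation for Japanese.
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.user_api import parse_ocr_pdf

with open("scanned_ja.pdf", "rb") as f:  # assumed sample file
    pdf_bytes = f.read()
image_writer = DiskReaderWriter("./output/images")
models = []  # normally the output of doc_analyze(..., ocr=True, lang="japan")

pdf_info_dict = parse_ocr_pdf(pdf_bytes, models, image_writer, lang="japan")
print(pdf_info_dict.get("_lang"))  # -> "japan" (set only when lang was given)
```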
@@ -3,4 +3,6 @@
 ## Project List

 - [llama_index_rag](./llama_index_rag/README.md): Build a lightweight RAG system based on llama_index
+- [gradio_app](./gradio_app/README.md): Build a web app based on gradio
@@ -3,3 +3,5 @@
 ## Project List

 - [llama_index_rag](./llama_index_rag/README_zh-CN.md): Build a lightweight RAG system based on llama_index
+- [gradio_app](./gradio_app/README_zh-CN.md): Build a web app based on Gradio
## Installation

MinerU (>= 0.8.0)

> If you already have a working MinerU environment, you can skip this step.

[Deploy in CPU environment](https://github.com/opendatalab/MinerU?tab=readme-ov-file#quick-cpu-demo)

[Deploy in GPU environment](https://github.com/opendatalab/MinerU?tab=readme-ov-file#using-gpu)

Third-party software:

```bash
pip install gradio gradio-pdf
```

## Start the Gradio App

```bash
python app.py
```

## Use the Gradio App

Access http://127.0.0.1:7860 in your web browser
\ No newline at end of file
## Installation

MinerU (>= 0.8.0)

> If you already have a working MinerU environment, you can skip this step.

[Deploy in CPU environment](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#%E4%BD%BF%E7%94%A8cpu%E5%BF%AB%E9%80%9F%E4%BD%93%E9%AA%8C)

[Deploy in GPU environment](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#%E4%BD%BF%E7%94%A8gpu)

Third-party software:

```bash
pip install gradio gradio-pdf
```

## Start the Gradio App

```bash
python app.py
```

## Use the Gradio App

Access http://127.0.0.1:7860 in your web browser
\ No newline at end of file
@@ -14,8 +14,6 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from magic_pdf.tools.common import do_parse, prepare_env

-os.system("pip install gradio")
-os.system("pip install gradio-pdf")
 import gradio as gr
 from gradio_pdf import PDF
@@ -25,13 +23,16 @@ def read_fn(path):
     return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)


-def parse_pdf(doc_path, output_dir, end_page_id):
+def parse_pdf(doc_path, output_dir, end_page_id, is_ocr):
     os.makedirs(output_dir, exist_ok=True)

     try:
         file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
         pdf_data = read_fn(doc_path)
-        parse_method = "auto"
+        if is_ocr:
+            parse_method = "ocr"
+        else:
+            parse_method = "auto"
         local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
         do_parse(
             output_dir,
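A side note on the branch added above: both arms only choose a string, so an equivalent conditional expression would be more compact (a style suggestion, not what the commit ships):

```python
# Equivalent to the if/else added in parse_pdf above.
parse_method = "ocr" if is_ocr else "auto"
```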
@@ -92,9 +93,9 @@ def replace_image_with_base64(markdown_text, image_dir_path):
     return re.sub(pattern, replace, markdown_text)


-def to_markdown(file_path, end_pages):
+def to_markdown(file_path, end_pages, is_ocr):
     # Get the paths of the converted md file and the zip archive
-    local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1)
+    local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr)
     archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
     zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
     if zip_archive_success == 0:
@@ -111,14 +112,6 @@ def to_markdown(file_path, end_pages):
     return md_content, txt_content, archive_zip_path, new_pdf_path


-# def show_pdf(file_path):
-#     with open(file_path, "rb") as f:
-#         base64_pdf = base64.b64encode(f.read()).decode('utf-8')
-#     pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" ' \
-#                   f'width="100%" height="1000" type="application/pdf">'
-#     return pdf_display


 latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
                     {"left": '$', "right": '$', "display": False}]
@@ -141,16 +134,29 @@ model_init = init_model()
 logger.info(f"model_init: {model_init}")

 with open("header.html", "r") as file:
     header = file.read()

 if __name__ == "__main__":
     with gr.Blocks() as demo:
         gr.HTML(header)
         with gr.Row():
             with gr.Column(variant='panel', scale=5):
                 pdf_show = gr.Markdown()
                 max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
                 with gr.Row() as bu_flow:
+                    is_ocr = gr.Checkbox(label="Force enable OCR")
                     change_bu = gr.Button("Convert")
                     clear_bu = gr.ClearButton([pdf_show], value="Clear")
                 pdf_show = PDF(label="Please upload pdf", interactive=True, height=800)
+                with gr.Accordion("Examples:"):
+                    example_root = os.path.join(os.path.dirname(__file__), "examples")
+                    gr.Examples(
+                        examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
+                                  _.endswith("pdf")],
+                        inputs=pdf_show,
+                    )

             with gr.Column(variant='panel', scale=5):
                 output_file = gr.File(label="convert result", interactive=False)
@@ -160,8 +166,7 @@ if __name__ == "__main__":
                     latex_delimiters=latex_delimiters, line_breaks=True)
                 with gr.Tab("Markdown text"):
                     md_text = gr.TextArea(lines=45, show_copy_button=True)
-        change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages], outputs=[md, md_text, output_file, pdf_show])
-        clear_bu.add([md, pdf_show, md_text, output_file])
-    demo.launch()
+        change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr], outputs=[md, md_text, output_file, pdf_show])
+        clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
+    demo.launch()
\ No newline at end of file
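The checkbox reaches `to_markdown` purely through the `inputs` list: gradio passes component values positionally, so the callback signature `(file_path, end_pages, is_ocr)` must match the list order. A self-contained toy version of that contract, unrelated to MinerU:

```python
# Minimal demonstration of gradio's positional inputs contract used above.
import gradio as gr


def describe(path, pages, force_ocr):
    # Values arrive in the same order as the inputs list below.
    return f"path={path!r}, pages={pages}, force_ocr={force_ocr}"


with gr.Blocks() as demo:
    path_box = gr.Textbox(label="file path")
    pages = gr.Slider(1, 10, 5, step=1, label="pages")
    ocr_box = gr.Checkbox(label="force OCR")
    result = gr.Textbox(label="result")
    gr.Button("Run").click(fn=describe, inputs=[path_box, pages, ocr_box], outputs=[result])

demo.launch()
```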