refactor: simplify file reading function and improve input validation

02898cdd · myhloli · 7eed5ee9 · 02898cdd · 02898cdd
Commit 02898cdd authored Jun 11, 2025 by myhloli
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 12 additions and 26 deletions

mineru/cli/common.py +3 -1

projects/gradio_app/app.py +9 -25

No files found.
--- a/mineru/cli/common.py
+++ b/mineru/cli/common.py
@@ -23,7 +23,9 @@ pdf_suffixes = [".pdf"]
 image_suffixes = [".png", ".jpeg", ".jpg"]
-def read_fn(path: Path):
+def read_fn(path):
+    if not isinstance(path, Path):
+        path = Path(path)
    with open(str(path), "rb") as input_file:
        file_bytes = input_file.read()
        if path.suffix in image_suffixes:

--- a/projects/gradio_app/app.py
+++ b/projects/gradio_app/app.py
@@ -12,16 +12,10 @@ import gradio as gr
 from gradio_pdf import PDF
 from loguru import logger
-from mineru.cli.common import prepare_env, do_parse
+from mineru.cli.common import prepare_env, do_parse, read_fn
-from mineru.data.data_reader_writer import FileBasedDataReader
 from mineru.utils.hash_utils import str_sha256
-def read_fn(path):
-    disk_rw = FileBasedDataReader(os.path.dirname(path))
-    return disk_rw.read(os.path.basename(path))
 def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_enable, language):
    os.makedirs(output_dir, exist_ok=True)
@@ -120,19 +114,6 @@ latex_delimiters = [
 ]
-def init_model():
-    try:
-        pass
-        return 0
-    except Exception as e:
-        logger.exception(e)
-        return -1
-model_init = init_model()
-logger.info(f'model_init: {model_init}')
 with open('header.html', 'r') as file:
    header = file.read()
@@ -162,6 +143,8 @@ all_lang.extend([*other_lang, *add_lang])
 def to_pdf(file_path):
+    if file_path is None:
+        return None
    pdf_bytes = read_fn(file_path)
    # 将pdfbytes 写入到uuid.pdf中
    # 生成唯一的文件名
@@ -182,14 +165,15 @@ if __name__ == '__main__':
        gr.HTML(header)
        with gr.Row():
            with gr.Column(variant='panel', scale=5):
-                file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
-                max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
                with gr.Row():
-                    with gr.Column():
+                    file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
-                        is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
+                with gr.Row(equal_height=True):
-                    with gr.Column():
+                    with gr.Column(scale=3):
+                        max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
+                    with gr.Column(scale=1):
                        language = gr.Dropdown(all_lang, label='Language', value='ch')
                with gr.Row():
+                    is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
                    formula_enable = gr.Checkbox(label='Enable formula recognition', value=True)
                    table_enable = gr.Checkbox(label='Enable table recognition(test)', value=True)
                with gr.Row():