Commit 02898cdd authored by myhloli
Browse files

refactor: simplify file reading function and improve input validation

parent 7eed5ee9
......@@ -23,7 +23,9 @@ pdf_suffixes = [".pdf"]
image_suffixes = [".png", ".jpeg", ".jpg"]
def read_fn(path: Path):
def read_fn(path):
if not isinstance(path, Path):
path = Path(path)
with open(str(path), "rb") as input_file:
file_bytes = input_file.read()
if path.suffix in image_suffixes:
......
......@@ -12,16 +12,10 @@ import gradio as gr
from gradio_pdf import PDF
from loguru import logger
from mineru.cli.common import prepare_env, do_parse
from mineru.data.data_reader_writer import FileBasedDataReader
from mineru.cli.common import prepare_env, do_parse, read_fn
from mineru.utils.hash_utils import str_sha256
def read_fn(path):
disk_rw = FileBasedDataReader(os.path.dirname(path))
return disk_rw.read(os.path.basename(path))
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_enable, language):
os.makedirs(output_dir, exist_ok=True)
......@@ -120,19 +114,6 @@ latex_delimiters = [
]
def init_model():
try:
pass
return 0
except Exception as e:
logger.exception(e)
return -1
model_init = init_model()
logger.info(f'model_init: {model_init}')
with open('header.html', 'r') as file:
header = file.read()
......@@ -162,6 +143,8 @@ all_lang.extend([*other_lang, *add_lang])
def to_pdf(file_path):
if file_path is None:
return None
pdf_bytes = read_fn(file_path)
# 将pdfbytes 写入到uuid.pdf中
# 生成唯一的文件名
......@@ -182,14 +165,15 @@ if __name__ == '__main__':
gr.HTML(header)
with gr.Row():
with gr.Column(variant='panel', scale=5):
file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
with gr.Row():
with gr.Column():
is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
with gr.Column():
file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
with gr.Row(equal_height=True):
with gr.Column(scale=3):
max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
with gr.Column(scale=1):
language = gr.Dropdown(all_lang, label='Language', value='ch')
with gr.Row():
is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
formula_enable = gr.Checkbox(label='Enable formula recognition', value=True)
table_enable = gr.Checkbox(label='Enable table recognition(test)', value=True)
with gr.Row():
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment