Commit 02898cdd authored by myhloli
Browse files

refactor: simplify file reading function and improve input validation

parent 7eed5ee9
...@@ -23,7 +23,9 @@ pdf_suffixes = [".pdf"] ...@@ -23,7 +23,9 @@ pdf_suffixes = [".pdf"]
image_suffixes = [".png", ".jpeg", ".jpg"] image_suffixes = [".png", ".jpeg", ".jpg"]
def read_fn(path: Path): def read_fn(path):
if not isinstance(path, Path):
path = Path(path)
with open(str(path), "rb") as input_file: with open(str(path), "rb") as input_file:
file_bytes = input_file.read() file_bytes = input_file.read()
if path.suffix in image_suffixes: if path.suffix in image_suffixes:
......
...@@ -12,16 +12,10 @@ import gradio as gr ...@@ -12,16 +12,10 @@ import gradio as gr
from gradio_pdf import PDF from gradio_pdf import PDF
from loguru import logger from loguru import logger
from mineru.cli.common import prepare_env, do_parse from mineru.cli.common import prepare_env, do_parse, read_fn
from mineru.data.data_reader_writer import FileBasedDataReader
from mineru.utils.hash_utils import str_sha256 from mineru.utils.hash_utils import str_sha256
def read_fn(path):
disk_rw = FileBasedDataReader(os.path.dirname(path))
return disk_rw.read(os.path.basename(path))
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_enable, language): def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_enable, language):
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
...@@ -120,19 +114,6 @@ latex_delimiters = [ ...@@ -120,19 +114,6 @@ latex_delimiters = [
] ]
def init_model():
try:
pass
return 0
except Exception as e:
logger.exception(e)
return -1
model_init = init_model()
logger.info(f'model_init: {model_init}')
with open('header.html', 'r') as file: with open('header.html', 'r') as file:
header = file.read() header = file.read()
...@@ -162,6 +143,8 @@ all_lang.extend([*other_lang, *add_lang]) ...@@ -162,6 +143,8 @@ all_lang.extend([*other_lang, *add_lang])
def to_pdf(file_path): def to_pdf(file_path):
if file_path is None:
return None
pdf_bytes = read_fn(file_path) pdf_bytes = read_fn(file_path)
# 将pdfbytes 写入到uuid.pdf中 # 将pdfbytes 写入到uuid.pdf中
# 生成唯一的文件名 # 生成唯一的文件名
...@@ -182,14 +165,15 @@ if __name__ == '__main__': ...@@ -182,14 +165,15 @@ if __name__ == '__main__':
gr.HTML(header) gr.HTML(header)
with gr.Row(): with gr.Row():
with gr.Column(variant='panel', scale=5): with gr.Column(variant='panel', scale=5):
file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
with gr.Row(): with gr.Row():
with gr.Column(): file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
is_ocr = gr.Checkbox(label='Force enable OCR', value=False) with gr.Row(equal_height=True):
with gr.Column(): with gr.Column(scale=3):
max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
with gr.Column(scale=1):
language = gr.Dropdown(all_lang, label='Language', value='ch') language = gr.Dropdown(all_lang, label='Language', value='ch')
with gr.Row(): with gr.Row():
is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
formula_enable = gr.Checkbox(label='Enable formula recognition', value=True) formula_enable = gr.Checkbox(label='Enable formula recognition', value=True)
table_enable = gr.Checkbox(label='Enable table recognition(test)', value=True) table_enable = gr.Checkbox(label='Enable table recognition(test)', value=True)
with gr.Row(): with gr.Row():
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment