"git@developer.sourcefind.cn:hehl2/torchaudio.git" did not exist on "b036725166ffc2803c0d4156f7abd859b45e92b3"
Commit 7d27726e authored by myhloli
Browse files

refactor: improve file naming logic and enhance unique filename generation

parent 83f7c26f
...@@ -20,7 +20,7 @@ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_e ...@@ -20,7 +20,7 @@ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_e
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
try: try:
file_name = f'{str(Path(doc_path).stem)}_{time.time()}' file_name = f'{str(Path(doc_path).stem)}_{time.strftime("%y%m%d_%H%M%S")}'
pdf_data = read_fn(doc_path) pdf_data = read_fn(doc_path)
if is_ocr: if is_ocr:
parse_method = 'ocr' parse_method = 'ocr'
...@@ -142,13 +142,21 @@ all_lang = [] ...@@ -142,13 +142,21 @@ all_lang = []
all_lang.extend([*other_lang, *add_lang]) all_lang.extend([*other_lang, *add_lang])
def safe_stem(file_path):
    """Return the stem of ``file_path`` with unsafe characters replaced.

    The stem is the final path component without its last suffix, as given
    by ``Path(file_path).stem``. Every character that is neither a regex
    word character (letter, digit or underscore) nor a dot is replaced
    with an underscore.

    Args:
        file_path: Path whose stem should be sanitized.

    Returns:
        The sanitized stem as a string.
    """
    stem = Path(file_path).stem
    # Keep only letters, digits, underscores and dots; replace every other
    # character with an underscore.
    return re.sub(r'[^\w.]', '_', stem)
def to_pdf(file_path): def to_pdf(file_path):
if file_path is None: if file_path is None:
return None return None
pdf_bytes = read_fn(file_path) pdf_bytes = read_fn(file_path)
# 将pdfbytes 写入到uuid.pdf中
# 生成唯一的文件名 # unique_filename = f'{uuid.uuid4()}.pdf'
unique_filename = f'{uuid.uuid4()}.pdf' unique_filename = f'{safe_stem(file_path)}.pdf'
# 构建完整的文件路径 # 构建完整的文件路径
tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename) tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
...@@ -168,7 +176,7 @@ if __name__ == '__main__': ...@@ -168,7 +176,7 @@ if __name__ == '__main__':
with gr.Row(): with gr.Row():
file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg']) file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
with gr.Row(equal_height=True): with gr.Row(equal_height=True):
with gr.Column(scale=3): with gr.Column(scale=4):
max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages') max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
with gr.Column(scale=1): with gr.Column(scale=1):
language = gr.Dropdown(all_lang, label='Language', value='ch') language = gr.Dropdown(all_lang, label='Language', value='ch')
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment