Commit 4f28604a authored by zhougaofeng
Browse files

Update common.py

parent e89b93b9
import copy import copy
import json as json_parse import json as json_parse
import os import os
import re
import click import click
from loguru import logger from loguru import logger
...@@ -16,9 +17,13 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter ...@@ -16,9 +17,13 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.post_proc.remove_spaces_html import remove_extra_spaces_html_txt from magic_pdf.post_proc.remove_spaces_html import remove_extra_spaces_html_txt
def sanitize_filename(filename: str) -> str:
    """Return *filename* with every disallowed character replaced by ``_``.

    Characters kept unchanged are ASCII letters and digits, the underscore,
    the hyphen, and code points U+4E00 through U+9FFF. Every other character
    (dots, path separators, whitespace, ...) is replaced one-for-one with a
    single underscore, so the result has the same length as the input.

    Args:
        filename: The name to clean up.

    Returns:
        The cleaned name; an empty string stays empty.
    """
    # The trailing '-' inside the character class is a literal hyphen, not a
    # range operator.
    return re.sub(r'[^a-zA-Z0-9_\u4e00-\u9fff-]', '_', filename)
def prepare_env(output_dir, pdf_file_name, method): def prepare_env(output_dir, pdf_file_name, method):
local_parent_dir = os.path.join(output_dir, pdf_file_name, method) # logger.info(f'pdf_file_name:{pdf_file_name}')
pdf_file_name = sanitize_filename(pdf_file_name)
local_parent_dir = os.path.join(output_dir, pdf_file_name)
local_image_dir = os.path.join(str(local_parent_dir), 'images') local_image_dir = os.path.join(str(local_parent_dir), 'images')
local_md_dir = local_parent_dir local_md_dir = local_parent_dir
...@@ -116,6 +121,7 @@ def do_parse( ...@@ -116,6 +121,7 @@ def do_parse(
md_make_mode=f_make_md_mode) md_make_mode=f_make_md_mode)
try: try:
pdf_file_name = sanitize_filename(pdf_file_name)
txt_file = f'{pdf_file_name}.txt' txt_file = f'{pdf_file_name}.txt'
md_writer.write( md_writer.write(
content=md_content, content=md_content,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment