Commit 114161e0 authored by zhougaofeng
Browse files

Update common.py

parent d61fddef
......@@ -73,9 +73,6 @@ def do_parse(
image_writer, md_writer = DiskReaderWriter(
local_image_dir), DiskReaderWriter(local_md_dir)
image_dir = str(os.path.basename(local_image_dir))
# logger.info(f'model_list:{model_list}')
# logger.info(f'local_image_dir:::{local_image_dir}')
# logger.info(f'image_dir:::{image_dir}')
if parse_method == 'auto':
jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
......@@ -96,7 +93,6 @@ def do_parse(
if len(model_list) == 0:
if model_config.__use_inside_model__:
pipe.pipe_analyze()
# logger.info(f'执行pipe.pipe_analyze()之后的pipe.model_list:{pipe.model_list}')
orig_model_list = copy.deepcopy(pipe.model_list)
else:
logger.error('need model list input')
......@@ -106,11 +102,6 @@ def do_parse(
pdf_info = pipe.pdf_mid_data['pdf_info']
if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
# if f_draw_span_bbox:
# draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
# if f_draw_model_bbox:
# drow_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
md_content = pipe.pipe_mk_markdown(config_path,local_image_dir,
drop_mode=DropMode.NONE,
md_make_mode=f_make_md_mode)
......@@ -121,55 +112,25 @@ def do_parse(
path=f'{pdf_file_name}.txt',
mode=AbsReaderWriter.MODE_TXT,
)
except Exception as e:
logger.info(f'{pdf_file_name}导出txt文件失败,具体原因为:\n{e}')
filepath = os.path.join(str(local_md_dir), f'{pdf_file_name}.txt')
logger.info(f'txt文件保存在filepath:{filepath}')
remove_empty_lines_from_file(filepath)
# if f_dump_md:
# md_writer.write(
# content=md_content,
# path=f'{pdf_file_name}.md',
# mode=AbsReaderWriter.MODE_TXT,
# )
#
# if f_dump_middle_json:
# md_writer.write(
# content=json_parse.dumps(pipe.pdf_mid_data,
# ensure_ascii=False,
# indent=4),
# path=f'{pdf_file_name}_middle.json',
# mode=AbsReaderWriter.MODE_TXT,
# )
#
# if f_dump_model_json:
# md_writer.write(
# content=json_parse.dumps(orig_model_list,
# ensure_ascii=False,
# indent=4),
# path=f'{pdf_file_name}_model.json',
# mode=AbsReaderWriter.MODE_TXT,
# )
#
if f_dump_orig_pdf:
md_writer.write(
content=pdf_bytes,
path=f'{pdf_file_name}_origin.pdf',
mode=AbsReaderWriter.MODE_BIN,
)
#
# content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
# if f_dump_content_list:
# md_writer.write(
# content=json_parse.dumps(content_list,
# ensure_ascii=False,
# indent=4),
# path=f'{pdf_file_name}_content_list.json',
# mode=AbsReaderWriter.MODE_TXT,
# )
filepath = os.path.join(str(local_md_dir), f'{pdf_file_name}.txt')
logger.info(f'txt文件保存在filepath:{filepath}')
remove_empty_lines_from_file(filepath)
logger.info(f'local output dir is {local_md_dir}')
if f_dump_orig_pdf:
md_writer.write(
content=pdf_bytes,
path=f'{pdf_file_name}_origin.pdf',
mode=AbsReaderWriter.MODE_BIN,
)
logger.info(f'local output dir is {local_md_dir}')
return filepath
except Exception as e:
logger.error(f'{pdf_file_name}导出txt文件失败,具体原因为:\n{e}')
return None
# Parse-method names wrapped in a click.Choice: 'ocr', 'txt' and 'auto'.
# NOTE(review): presumably used as the type of a CLI option that selects
# the parse method -- confirm at the usage site, which is not visible here.
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment