Commit 4eaa85fd authored by myhloli
Browse files

refactor: update make mode constants to improve content list handling

parent c01b780b
...@@ -260,14 +260,14 @@ def union_make(pdf_info_dict: list, ...@@ -260,14 +260,14 @@ def union_make(pdf_info_dict: list,
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
page_markdown = make_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path) page_markdown = make_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path)
output_content.extend(page_markdown) output_content.extend(page_markdown)
elif make_mode == MakeMode.STANDARD_FORMAT: elif make_mode == MakeMode.CONTENT_LIST:
for para_block in paras_of_layout: for para_block in paras_of_layout:
para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx) para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx)
output_content.append(para_content) output_content.append(para_content)
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
return '\n\n'.join(output_content) return '\n\n'.join(output_content)
elif make_mode == MakeMode.STANDARD_FORMAT: elif make_mode == MakeMode.CONTENT_LIST:
return output_content return output_content
else: else:
logger.error(f"Unsupported make mode: {make_mode}") logger.error(f"Unsupported make mode: {make_mode}")
......
...@@ -186,14 +186,14 @@ def union_make(pdf_info_dict: list, ...@@ -186,14 +186,14 @@ def union_make(pdf_info_dict: list,
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path) page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path)
output_content.extend(page_markdown) output_content.extend(page_markdown)
elif make_mode == MakeMode.STANDARD_FORMAT: elif make_mode == MakeMode.CONTENT_LIST:
for para_block in paras_of_layout: for para_block in paras_of_layout:
para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx) para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx)
output_content.append(para_content) output_content.append(para_content)
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
return '\n\n'.join(output_content) return '\n\n'.join(output_content)
elif make_mode == MakeMode.STANDARD_FORMAT: elif make_mode == MakeMode.CONTENT_LIST:
return output_content return output_content
return None return None
......
...@@ -143,7 +143,7 @@ def do_parse( ...@@ -143,7 +143,7 @@ def do_parse(
if f_dump_content_list: if f_dump_content_list:
image_dir = str(os.path.basename(local_image_dir)) image_dir = str(os.path.basename(local_image_dir))
content_list = pipeline_union_make(pdf_info, MakeMode.STANDARD_FORMAT, image_dir) content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
md_writer.write_string( md_writer.write_string(
f"{pdf_file_name}_content_list.json", f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4), json.dumps(content_list, ensure_ascii=False, indent=4),
...@@ -200,7 +200,7 @@ def do_parse( ...@@ -200,7 +200,7 @@ def do_parse(
if f_dump_content_list: if f_dump_content_list:
image_dir = str(os.path.basename(local_image_dir)) image_dir = str(os.path.basename(local_image_dir))
content_list = vlm_union_make(pdf_info, MakeMode.STANDARD_FORMAT, image_dir) content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
md_writer.write_string( md_writer.write_string(
f"{pdf_file_name}_content_list.json", f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4), json.dumps(content_list, ensure_ascii=False, indent=4),
......
...@@ -42,7 +42,7 @@ class CategoryId: ...@@ -42,7 +42,7 @@ class CategoryId:
class MakeMode: class MakeMode:
MM_MD = 'mm_markdown' MM_MD = 'mm_markdown'
NLP_MD = 'nlp_markdown' NLP_MD = 'nlp_markdown'
STANDARD_FORMAT = 'standard_format' CONTENT_LIST = 'content_list'
class ModelPath: class ModelPath:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment