Commit b9336031 authored by zhougaofeng
Browse files

Update pdf_parse_union_core_v2.py

parent 28e8f8b8
...@@ -381,7 +381,7 @@ def revert_group_blocks(blocks): ...@@ -381,7 +381,7 @@ def revert_group_blocks(blocks):
return new_blocks return new_blocks
def parse_page_core(ocr_status,config_path,local_image_dir, def parse_page_core(config_path,local_image_dir,
page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
): ):
need_drop = False need_drop = False
...@@ -428,7 +428,7 @@ def parse_page_core(ocr_status,config_path,local_image_dir, ...@@ -428,7 +428,7 @@ def parse_page_core(ocr_status,config_path,local_image_dir,
"""删除重叠spans中较小的那些""" """删除重叠spans中较小的那些"""
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans) spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
"""对image和table截图""" """对image和table截图"""
spans = ocr_cut_image_and_table(ocr_status,config_path,local_image_dir, spans = ocr_cut_image_and_table(config_path,local_image_dir,
spans, page_doc, page_id, pdf_bytes_md5, imageWriter spans, page_doc, page_id, pdf_bytes_md5, imageWriter
) )
...@@ -524,7 +524,7 @@ def parse_page_core(ocr_status,config_path,local_image_dir, ...@@ -524,7 +524,7 @@ def parse_page_core(ocr_status,config_path,local_image_dir,
return page_info return page_info
def pdf_parse_union(ocr_status,config_path,local_image_dir, def pdf_parse_union(config_path,local_image_dir,
dataset: Dataset, dataset: Dataset,
model_list, model_list,
imageWriter, imageWriter,
...@@ -568,7 +568,7 @@ def pdf_parse_union(ocr_status,config_path,local_image_dir, ...@@ -568,7 +568,7 @@ def pdf_parse_union(ocr_status,config_path,local_image_dir,
"""解析pdf中的每一页""" """解析pdf中的每一页"""
if start_page_id <= page_id <= end_page_id: if start_page_id <= page_id <= end_page_id:
page_info = parse_page_core( page_info = parse_page_core(
ocr_status,config_path,local_image_dir,page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode config_path,local_image_dir,page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
) )
else: else:
page_info = page.get_page_info() page_info = page.get_page_info()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment