Commit 3334157f authored by myhloli
Browse files

refactor: clean up unused OCR area calculation and update demo PDF path

parent 236a6033
...@@ -230,19 +230,6 @@ class BatchAnalyze: ...@@ -230,19 +230,6 @@ class BatchAnalyze:
ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'],
new_image, _lang) new_image, _lang)
# if res["category_id"] == 3 and ocr_res_list_dict['ocr_enable']:
# # ocr_result_list中所有bbox的面积之和
# ocr_res_area = sum(
# get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
# # 求ocr_res_area和res的面积的比值
# res_area = get_coords_and_area(res)[4]
# if res_area > 0:
# ratio = ocr_res_area / res_area
# if ratio > 0.25:
# res["category_id"] = 1
# else:
# continue
ocr_res_list_dict['layout_res'].extend(ocr_result_list) ocr_res_list_dict['layout_res'].extend(ocr_result_list)
# 表格识别 table recognition # 表格识别 table recognition
......
...@@ -48,6 +48,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer ...@@ -48,6 +48,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
"""获取所有的spans信息""" """获取所有的spans信息"""
spans = magic_model.get_all_spans() spans = magic_model.get_all_spans()
"""某些图可能是文本块,通过简单的规则判断一下"""
if len(maybe_text_image_blocks) > 0: if len(maybe_text_image_blocks) > 0:
for block in maybe_text_image_blocks: for block in maybe_text_image_blocks:
span_in_block_list = [] span_in_block_list = []
...@@ -64,8 +65,10 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer ...@@ -64,8 +65,10 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
if ratio > 0.25 and ocr: if ratio > 0.25 and ocr:
# 移除block的group_id # 移除block的group_id
block.pop('group_id', None) block.pop('group_id', None)
# 符合文本图的条件就把块加入到文本块列表中
text_blocks.append(block) text_blocks.append(block)
else: else:
# 如果不符合文本图的条件,就把块加回到图片块列表中
img_body_blocks.append(block) img_body_blocks.append(block)
else: else:
img_body_blocks.append(block) img_body_blocks.append(block)
......
...@@ -215,9 +215,10 @@ def do_parse( ...@@ -215,9 +215,10 @@ def do_parse(
if __name__ == "__main__": if __name__ == "__main__":
pdf_path = "../../demo/pdfs/demo2.pdf" pdf_path = "../../demo/pdfs/hello-algo-1.1.0-zh-c-word转换的span有问题.pdf"
with open(pdf_path, "rb") as f: # pdf_path = "C:/Users/zhaoxiaomeng/Downloads/input_img_0.jpg"
try: try:
do_parse("./output", [Path(pdf_path).stem], [f.read()],["ch"], end_page_id=20,) do_parse("./output", [Path(pdf_path).stem], [read_fn(Path(pdf_path))],["ch"], end_page_id=20,)
except Exception as e: except Exception as e:
logger.exception(e) logger.exception(e)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment