"src/array/cuda/sddmm_hetero_coo.hip" did not exist on "8ac27dad1a20b4228419e64746ae9110416e34ee"
Commit 3334157f authored by myhloli's avatar myhloli
Browse files

refactor: clean up unused OCR area calculation and update demo PDF path

parent 236a6033
......@@ -230,19 +230,6 @@ class BatchAnalyze:
ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'],
new_image, _lang)
# if res["category_id"] == 3 and ocr_res_list_dict['ocr_enable']:
# # ocr_result_list中所有bbox的面积之和
# ocr_res_area = sum(
# get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
# # 求ocr_res_area和res的面积的比值
# res_area = get_coords_and_area(res)[4]
# if res_area > 0:
# ratio = ocr_res_area / res_area
# if ratio > 0.25:
# res["category_id"] = 1
# else:
# continue
ocr_res_list_dict['layout_res'].extend(ocr_result_list)
# 表格识别 table recognition
......
......@@ -48,6 +48,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
"""获取所有的spans信息"""
spans = magic_model.get_all_spans()
"""某些图可能是文本块,通过简单的规则判断一下"""
if len(maybe_text_image_blocks) > 0:
for block in maybe_text_image_blocks:
span_in_block_list = []
......@@ -64,8 +65,10 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
if ratio > 0.25 and ocr:
# 移除block的group_id
block.pop('group_id', None)
# 符合文本图的条件就把块加入到文本块列表中
text_blocks.append(block)
else:
# 如果不符合文本图的条件,就把块加回到图片块列表中
img_body_blocks.append(block)
else:
img_body_blocks.append(block)
......
......@@ -215,9 +215,10 @@ def do_parse(
if __name__ == "__main__":
pdf_path = "../../demo/pdfs/demo2.pdf"
with open(pdf_path, "rb") as f:
pdf_path = "../../demo/pdfs/hello-algo-1.1.0-zh-c-word转换的span有问题.pdf"
# pdf_path = "C:/Users/zhaoxiaomeng/Downloads/input_img_0.jpg"
try:
do_parse("./output", [Path(pdf_path).stem], [f.read()],["ch"], end_page_id=20,)
do_parse("./output", [Path(pdf_path).stem], [read_fn(Path(pdf_path))],["ch"], end_page_id=20,)
except Exception as e:
logger.exception(e)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment