Commit a024c30f authored by myhloli's avatar myhloli
Browse files

feat(ocr): implement dynamic OCR processing for text spans with low contrast

- Comment out OCR model initialization and execution for low-contrast spans
- Add batch OCR processing for collected image spans
- Adjust contrast threshold for OCR processing
- Remove unnecessary OCR processing for high-contrast spans
- Implement more efficient OCR workflow by processing multiple spans at once
parent 41f1fb8a
...@@ -193,7 +193,7 @@ def calculate_contrast(img, img_mode) -> float: ...@@ -193,7 +193,7 @@ def calculate_contrast(img, img_mode) -> float:
std_dev = np.std(gray_img) std_dev = np.std(gray_img)
# 对比度定义为标准差除以平均值(加上小常数避免除零错误) # 对比度定义为标准差除以平均值(加上小常数避免除零错误)
contrast = std_dev / (mean_value + 1e-6) contrast = std_dev / (mean_value + 1e-6)
# logger.info(f"contrast: {contrast}") # logger.debug(f"contrast: {contrast}")
return round(contrast, 2) return round(contrast, 2)
# @measure_time # @measure_time
...@@ -286,33 +286,39 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang ...@@ -286,33 +286,39 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
if len(need_ocr_spans) > 0: if len(need_ocr_spans) > 0:
# 初始化ocr模型 # 初始化ocr模型
atom_model_manager = AtomModelSingleton() # atom_model_manager = AtomModelSingleton()
ocr_model = atom_model_manager.get_atom_model( # ocr_model = atom_model_manager.get_atom_model(
atom_model_name='ocr', # atom_model_name='ocr',
ocr_show_log=False, # ocr_show_log=False,
det_db_box_thresh=0.3, # det_db_box_thresh=0.3,
lang=lang # lang=lang
) # )
for span in need_ocr_spans: for span in need_ocr_spans:
# 对span的bbox截图再ocr # 对span的bbox截图再ocr
span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2') span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
# 计算span的对比度,低于0.20的span不进行ocr # 计算span的对比度,低于0.20的span不进行ocr
if calculate_contrast(span_img, img_mode='bgr') <= 0.20: if calculate_contrast(span_img, img_mode='bgr') <= 0.17:
spans.remove(span) spans.remove(span)
continue continue
# pass
ocr_res = ocr_model.ocr(span_img, det=False) span['content'] = ''
if ocr_res and len(ocr_res) > 0: span['score'] = 1
if len(ocr_res[0]) > 0: span['np_img'] = span_img
ocr_text, ocr_score = ocr_res[0][0]
# logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
if ocr_score > 0.5 and len(ocr_text) > 0: # ocr_res = ocr_model.ocr(span_img, det=False)
span['content'] = ocr_text # if ocr_res and len(ocr_res) > 0:
span['score'] = float(round(ocr_score, 2)) # if len(ocr_res[0]) > 0:
else: # ocr_text, ocr_score = ocr_res[0][0]
spans.remove(span) # # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
# if ocr_score > 0.5 and len(ocr_text) > 0:
# span['content'] = ocr_text
# span['score'] = float(round(ocr_score, 2))
# else:
# spans.remove(span)
return spans return spans
...@@ -952,7 +958,47 @@ def pdf_parse_union( ...@@ -952,7 +958,47 @@ def pdf_parse_union(
) )
pdf_info_dict[f'page_{page_id}'] = page_info pdf_info_dict[f'page_{page_id}'] = page_info
# PerformanceStats.print_stats() need_ocr_list = []
img_crop_list = []
text_block_list = []
for pange_id, page_info in pdf_info_dict.items():
for block in page_info['preproc_blocks']:
if block['type'] in ['table', 'image']:
for sub_block in block['blocks']:
if sub_block['type'] in ['image_caption', 'image_footnote', 'table_caption', 'table_footnote']:
text_block_list.append(sub_block)
elif block['type'] in ['text', 'title']:
text_block_list.append(block)
for block in page_info['discarded_blocks']:
text_block_list.append(block)
for block in text_block_list:
for line in block['lines']:
for span in line['spans']:
if 'np_img' in span:
need_ocr_list.append(span)
img_crop_list.append(span['np_img'])
span.pop('np_img')
if len(img_crop_list) > 0:
# Get OCR results for this language's images
atom_model_manager = AtomModelSingleton()
ocr_model = atom_model_manager.get_atom_model(
atom_model_name='ocr',
ocr_show_log=False,
det_db_box_thresh=0.3,
lang=lang
)
rec_start = time.time()
ocr_res_list = ocr_model.ocr(img_crop_list, det=False)[0]
# Verify we have matching counts
assert len(ocr_res_list) == len(need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
# Process OCR results for this language
for index, span in enumerate(need_ocr_list):
ocr_text, ocr_score = ocr_res_list[index]
span['content'] = ocr_text
span['score'] = float(round(ocr_score, 2))
rec_time = time.time() - rec_start
logger.info(f'ocr-dynamic-rec time: {round(rec_time, 2)}, total images processed: {len(img_crop_list)}')
"""分段""" """分段"""
para_split(pdf_info_dict) para_split(pdf_info_dict)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment