Commit f35a6c08 authored by myhloli's avatar myhloli
Browse files

refactor(filter): remove unused text layout analysis for PDF classification

parent 9f18ca20
...@@ -23,7 +23,7 @@ def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod: ...@@ -23,7 +23,7 @@ def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
pdf_meta['image_info_per_page'], pdf_meta['image_info_per_page'],
pdf_meta['text_len_per_page'], pdf_meta['text_len_per_page'],
pdf_meta['imgs_per_page'], pdf_meta['imgs_per_page'],
pdf_meta['text_layout_per_page'], # pdf_meta['text_layout_per_page'],
pdf_meta['invalid_chars'], pdf_meta['invalid_chars'],
) )
if is_text_pdf: if is_text_pdf:
......
...@@ -305,7 +305,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list): ...@@ -305,7 +305,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
text_layout_list: list, invalid_chars: bool): # text_layout_list: list,
invalid_chars: bool):
""" """
这里的图片和页面长度单位是pts 这里的图片和页面长度单位是pts
:param total_page: :param total_page:
...@@ -321,7 +322,7 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l ...@@ -321,7 +322,7 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
'by_text_len': classify_by_text_len(text_len_list, total_page), 'by_text_len': classify_by_text_len(text_len_list, total_page),
'by_avg_words': classify_by_avg_words(text_len_list), 'by_avg_words': classify_by_avg_words(text_len_list),
'by_img_num': classify_by_img_num(img_sz_list, img_num_list), 'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
'by_text_layout': classify_by_text_layout(text_layout_list), # 'by_text_layout': classify_by_text_layout(text_layout_list),
'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list), 'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
'by_invalid_chars': invalid_chars, 'by_invalid_chars': invalid_chars,
} }
...@@ -332,9 +333,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l ...@@ -332,9 +333,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
return False, results return False, results
else: else:
logger.warning( logger.warning(
f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}," f"OCR needed based on classification result, by_image_area: {results['by_image_area']},"
f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}," f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}," # f" by_text_layout: {results['by_text_layout']},"
f" by_img_narrow_strips: {results['by_img_narrow_strips']},"
f" by_invalid_chars: {results['by_invalid_chars']}", f" by_invalid_chars: {results['by_invalid_chars']}",
file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法 file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
return False, results return False, results
......
...@@ -356,9 +356,9 @@ def pdf_meta_scan(pdf_bytes: bytes): ...@@ -356,9 +356,9 @@ def pdf_meta_scan(pdf_bytes: bytes):
# logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}") # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
text_len_per_page = get_pdf_textlen_per_page(doc) text_len_per_page = get_pdf_textlen_per_page(doc)
# logger.info(f"text_len_per_page: {text_len_per_page}") # logger.info(f"text_len_per_page: {text_len_per_page}")
text_layout_per_page = get_pdf_text_layout_per_page(doc) # text_layout_per_page = get_pdf_text_layout_per_page(doc)
# logger.info(f"text_layout_per_page: {text_layout_per_page}") # logger.info(f"text_layout_per_page: {text_layout_per_page}")
text_language = get_language(doc) # text_language = get_language(doc)
# logger.info(f"text_language: {text_language}") # logger.info(f"text_language: {text_language}")
invalid_chars = check_invalid_chars(pdf_bytes) invalid_chars = check_invalid_chars(pdf_bytes)
# logger.info(f"invalid_chars: {invalid_chars}") # logger.info(f"invalid_chars: {invalid_chars}")
...@@ -372,8 +372,8 @@ def pdf_meta_scan(pdf_bytes: bytes): ...@@ -372,8 +372,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
'page_height_pts': int(page_height_pts), 'page_height_pts': int(page_height_pts),
'image_info_per_page': image_info_per_page, 'image_info_per_page': image_info_per_page,
'text_len_per_page': text_len_per_page, 'text_len_per_page': text_len_per_page,
'text_layout_per_page': text_layout_per_page, # 'text_layout_per_page': text_layout_per_page,
'text_language': text_language, # 'text_language': text_language,
# "svgs_per_page": svgs_per_page, # "svgs_per_page": svgs_per_page,
'imgs_per_page': imgs_per_page, # 增加每页img数量list 'imgs_per_page': imgs_per_page, # 增加每页img数量list
'junk_img_bojids': junk_img_bojids, # 增加垃圾图片的bojid list 'junk_img_bojids': junk_img_bojids, # 增加垃圾图片的bojid list
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment