"git@developer.sourcefind.cn:orangecat/ollama.git" did not exist on "5528dd9d1170e7a78a4fdb7684e8944e2052ca8f"
Commit f35a6c08 authored by myhloli's avatar myhloli
Browse files

refactor(filter): remove unused text layout analysis for PDF classification

parent 9f18ca20
......@@ -23,7 +23,7 @@ def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
pdf_meta['image_info_per_page'],
pdf_meta['text_len_per_page'],
pdf_meta['imgs_per_page'],
pdf_meta['text_layout_per_page'],
# pdf_meta['text_layout_per_page'],
pdf_meta['invalid_chars'],
)
if is_text_pdf:
......
......@@ -305,7 +305,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
text_layout_list: list, invalid_chars: bool):
# text_layout_list: list,
invalid_chars: bool):
"""
这里的图片和页面长度单位是pts
:param total_page:
......@@ -321,7 +322,7 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
'by_text_len': classify_by_text_len(text_len_list, total_page),
'by_avg_words': classify_by_avg_words(text_len_list),
'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
'by_text_layout': classify_by_text_layout(text_layout_list),
# 'by_text_layout': classify_by_text_layout(text_layout_list),
'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
'by_invalid_chars': invalid_chars,
}
......@@ -332,9 +333,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
return False, results
else:
logger.warning(
f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
f"OCR needed based on classification result, by_image_area: {results['by_image_area']},"
f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
# f" by_text_layout: {results['by_text_layout']},"
f" by_img_narrow_strips: {results['by_img_narrow_strips']},"
f" by_invalid_chars: {results['by_invalid_chars']}",
file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
return False, results
......
......@@ -356,9 +356,9 @@ def pdf_meta_scan(pdf_bytes: bytes):
# logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
text_len_per_page = get_pdf_textlen_per_page(doc)
# logger.info(f"text_len_per_page: {text_len_per_page}")
text_layout_per_page = get_pdf_text_layout_per_page(doc)
# text_layout_per_page = get_pdf_text_layout_per_page(doc)
# logger.info(f"text_layout_per_page: {text_layout_per_page}")
text_language = get_language(doc)
# text_language = get_language(doc)
# logger.info(f"text_language: {text_language}")
invalid_chars = check_invalid_chars(pdf_bytes)
# logger.info(f"invalid_chars: {invalid_chars}")
......@@ -372,8 +372,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
'page_height_pts': int(page_height_pts),
'image_info_per_page': image_info_per_page,
'text_len_per_page': text_len_per_page,
'text_layout_per_page': text_layout_per_page,
'text_language': text_language,
# 'text_layout_per_page': text_layout_per_page,
# 'text_language': text_language,
# "svgs_per_page": svgs_per_page,
'imgs_per_page': imgs_per_page, # 增加每页img数量list
'junk_img_bojids': junk_img_bojids, # 增加垃圾图片的bojid list
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment