refactor(filter): remove unused text layout analysis for PDF classification

f35a6c08 · myhloli · 9f18ca20 · f35a6c08 · f35a6c08 · f35a6c08
Commit f35a6c08 authored Feb 09, 2025 by myhloli
3 changed files
--- a/magic_pdf/filter/__init__.py
+++ b/magic_pdf/filter/__init__.py
@@ -23,7 +23,7 @@ def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
                pdf_meta['image_info_per_page'],
                pdf_meta['text_len_per_page'],
                pdf_meta['imgs_per_page'],
-                pdf_meta['text_layout_per_page'],
+                # pdf_meta['text_layout_per_page'],
                pdf_meta['invalid_chars'],
            )
            if is_text_pdf:

--- a/magic_pdf/filter/pdf_classify_by_type.py
+++ b/magic_pdf/filter/pdf_classify_by_type.py
@@ -305,7 +305,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):


 def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
-             text_layout_list: list, invalid_chars: bool):
+             # text_layout_list: list,
+             invalid_chars: bool):
    """
    这里的图片和页面长度单位是pts
    :param total_page:
@@ -321,7 +322,7 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
        'by_text_len': classify_by_text_len(text_len_list, total_page),
        'by_avg_words': classify_by_avg_words(text_len_list),
        'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
-        'by_text_layout': classify_by_text_layout(text_layout_list),
+        # 'by_text_layout': classify_by_text_layout(text_layout_list),
        'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
        'by_invalid_chars': invalid_chars,
    }
@@ -332,9 +333,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
        return False, results
    else:
        logger.warning(
-            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
+            f"OCR needed based on classification result, by_image_area: {results['by_image_area']},"
            f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
-            f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
+            # f" by_text_layout: {results['by_text_layout']},"
+            f" by_img_narrow_strips: {results['by_img_narrow_strips']},"
            f" by_invalid_chars: {results['by_invalid_chars']}",
            file=sys.stderr)  # 利用这种情况可以快速找出来哪些pdf比较特殊，针对性修正分类算法
        return False, results

--- a/magic_pdf/filter/pdf_meta_scan.py
+++ b/magic_pdf/filter/pdf_meta_scan.py
@@ -356,9 +356,9 @@ def pdf_meta_scan(pdf_bytes: bytes):
        # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
        text_len_per_page = get_pdf_textlen_per_page(doc)
        # logger.info(f"text_len_per_page: {text_len_per_page}")
-        text_layout_per_page = get_pdf_text_layout_per_page(doc)
+        # text_layout_per_page = get_pdf_text_layout_per_page(doc)
        # logger.info(f"text_layout_per_page: {text_layout_per_page}")
-        text_language = get_language(doc)
+        # text_language = get_language(doc)
        # logger.info(f"text_language: {text_language}")
        invalid_chars = check_invalid_chars(pdf_bytes)
        # logger.info(f"invalid_chars: {invalid_chars}")
@@ -372,8 +372,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
            'page_height_pts': int(page_height_pts),
            'image_info_per_page': image_info_per_page,
            'text_len_per_page': text_len_per_page,
-            'text_layout_per_page': text_layout_per_page,
-            'text_language': text_language,
+            # 'text_layout_per_page': text_layout_per_page,
+            # 'text_language': text_language,
            # "svgs_per_page": svgs_per_page,
            'imgs_per_page': imgs_per_page,  # 增加每页img数量list
            'junk_img_bojids': junk_img_bojids,  # 增加垃圾图片的bojid list