update AVG_TEXT_LEN_THRESHOLD 200->100

084dc22a · 赵小蒙 · 6c52856d · 084dc22a
Commit 084dc22a authored Jun 18, 2024 by 赵小蒙
Hide whitespace changes
Inline Side-by-side

Showing with 25 additions and 17 deletions

magic_pdf/filter/pdf_classify_by_type.py magic_pdf/filter/pdf_classify_by_type.py +25 -17

No files found.
--- a/magic_pdf/filter/pdf_classify_by_type.py
+++ b/magic_pdf/filter/pdf_classify_by_type.py
@@ -21,7 +21,7 @@ from magic_pdf.libs.commons import mymax, get_top_percent_list
 from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min

 TEXT_LEN_THRESHOLD = 100
-AVG_TEXT_LEN_THRESHOLD = 200
+AVG_TEXT_LEN_THRESHOLD = 100
 TEXT_LEN_SAMPLE_RATIO = 0.1  # 抽取0.1的页面进行文字长度统计


@@ -65,12 +65,14 @@ def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2):
            # 如果宽达标，检测是否能竖着拼
            if full_width:
                # 竖着拼需要满足两个前提，左右边界各偏移不能超过 max_offset，第一张图的下边界和第二张图的上边界偏移不能超过 max_gap
-                close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
+                close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (
+                            last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)

            # 如果高达标，检测是否可以横着拼
            if full_height:
                # 横着拼需要满足两个前提，上下边界各偏移不能超过 max_offset，第一张图的右边界和第二张图的左边界偏移不能超过 max_gap
-                close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
+                close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (
+                            last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)

            # Check if the image can be merged with the last image
            if (full_width and close1) or (full_height and close2):
@@ -109,10 +111,9 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
    # 先对每个id出现的次数做个统计
    objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz])
    # 再去掉出现次数大于10的
-    if total_page >= scan_max_page:# 新的meta_scan只扫描前 scan_max_page 页，页数大于 scan_max_page 当total_page为 scan_max_page
+    if total_page >= scan_max_page:  # 新的meta_scan只扫描前 scan_max_page 页，页数大于 scan_max_page 当total_page为 scan_max_page
        total_page = scan_max_page

-
    repeat_threshold = 2  # 把bad_image的阈值设为2
    # repeat_threshold = min(2, total_page)  # 当total_page为1时，repeat_threshold为1，会产生误判导致所有img变成bad_img
    bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold])
@@ -129,26 +130,26 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
    # if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]):  # 这些透明图片所在的页面上有文字大于阈值
    #     return True

-    img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in img_sz_list]  # 过滤掉重复出现的图片
-
+    img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in
+                   img_sz_list]  # 过滤掉重复出现的图片

    # 有的扫描版会把一页图片拆成很多张，需要先把图拼起来再计算
    img_sz_list = merge_images(img_sz_list, page_width, page_height)

    # 计算每个页面上最大的图的面积，然后计算这个面积占页面面积的比例
-    max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in img_sz_list]
+    max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
+                               img_sz_list]
    page_area = page_width * page_height
    max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
    max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5]

-    if len(max_image_area_per_page) >= 0.5 * total_page:   # 阈值从0.8改到0.5，适配3页里面有两页和两页里面有一页的情况
+    if len(max_image_area_per_page) >= 0.5 * total_page:  # 阈值从0.8改到0.5，适配3页里面有两页和两页里面有一页的情况
        # 这里条件成立的前提是把反复出现的图片去掉了。这些图片是隐藏的透明图层，其特点是id都一样
        return False
    else:
        return True


-
 def classify_by_text_len(text_len_list: list, total_page: int):
    """
    随机抽取10%的页面，如果少于5个页面，那么就取全部页面。
@@ -173,6 +174,7 @@ def classify_by_text_len(text_len_list: list, total_page: int):
    is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst])
    return is_text_pdf

+
 def classify_by_avg_words(text_len_list: list):
    """
    补充规则，如果平均每页字数少于 AVG_TEXT_LEN_THRESHOLD，就不是文字pdf
@@ -193,6 +195,7 @@ def classify_by_avg_words(text_len_list: list):

    return is_text_pdf

+
 def classify_by_img_num(img_sz_list: list, img_num_list: list):
    """
    补充规则，有一种扫描版本的PDF，每一页都会放所有的扫描页进去，在 metascan 时会被去重，
@@ -208,11 +211,11 @@ def classify_by_img_num(img_sz_list: list, img_num_list: list):
    # img_sz_list中非空元素的个数小于1，前80%的元素都相等，且最大值大于等于junk_limit_min
    if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min:

-    #拿max和min的值,用来判断list内的值是否全都相等
-    # min_imgs = min(img_num_list)
-    # max_imgs = max(img_num_list)
-    #
-    # if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
+        #拿max和min的值,用来判断list内的值是否全都相等
+        # min_imgs = min(img_num_list)
+        # max_imgs = max(img_num_list)
+        #
+        # if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
        return False  # 如果满足这个条件，一定不是文字版pdf
    else:
        return True  # 不满足这三个条件，可能是文字版pdf，通过其他规则判断
@@ -244,6 +247,7 @@ def classify_by_text_layout(text_layout_per_page: list):
    else:
        return False  # 文本布局未知，默认认为不是文字版pdf

+
 def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
    """
    判断一页是否由细长条组成，有两个条件：
@@ -258,6 +262,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
    Returns:
        bool: 如果满足条件的页面的比例小于0.5，返回True，否则返回False
    """
+
    def is_narrow_strip(img):
        x0, y0, x1, y1, _ = img
        width, height = x1 - x0, y1 - y0
@@ -299,7 +304,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
    return narrow_strip_pages_ratio < 0.5


-def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
+def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
+             text_layout_list: list):
    """
    这里的图片和页面长度单位是pts
    :param total_page:
@@ -324,7 +330,9 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
    elif not any(results.values()):
        return False, results
    else:
-        logger.warning(f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr)  # 利用这种情况可以快速找出来哪些pdf比较特殊，针对性修正分类算法
+        logger.warning(
+            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}",
+            file=sys.stderr)  # 利用这种情况可以快速找出来哪些pdf比较特殊，针对性修正分类算法
        return False, results