"docs/source/vscode:/vscode.git/clone" did not exist on "474510f2515a67412c20f023060b764cc7d20b43"
Commit 084dc22a authored by 赵小蒙
Browse files

update AVG_TEXT_LEN_THRESHOLD 200->100

parent 6c52856d
......@@ -21,7 +21,7 @@ from magic_pdf.libs.commons import mymax, get_top_percent_list
from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
TEXT_LEN_THRESHOLD = 100
AVG_TEXT_LEN_THRESHOLD = 200
AVG_TEXT_LEN_THRESHOLD = 100
TEXT_LEN_SAMPLE_RATIO = 0.1 # 抽取0.1的页面进行文字长度统计
......@@ -65,12 +65,14 @@ def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2):
# 如果宽达标,检测是否能竖着拼
if full_width:
# 竖着拼需要满足两个前提,左右边界各偏移不能超过 max_offset,第一张图的下边界和第二张图的上边界偏移不能超过 max_gap
close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (
last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
# 如果高达标,检测是否可以横着拼
if full_height:
# 横着拼需要满足两个前提,上下边界各偏移不能超过 max_offset,第一张图的右边界和第二张图的左边界偏移不能超过 max_gap
close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (
last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
# Check if the image can be merged with the last image
if (full_width and close1) or (full_height and close2):
......@@ -109,10 +111,9 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
# 先对每个id出现的次数做个统计
objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz])
# 再去掉出现次数大于10的
if total_page >= scan_max_page:# 新的meta_scan只扫描前 scan_max_page 页,页数大于 scan_max_page 当total_page为 scan_max_page
if total_page >= scan_max_page: # 新的meta_scan只扫描前 scan_max_page 页,页数大于 scan_max_page 当total_page为 scan_max_page
total_page = scan_max_page
repeat_threshold = 2 # 把bad_image的阈值设为2
# repeat_threshold = min(2, total_page) # 当total_page为1时,repeat_threshold为1,会产生误判导致所有img变成bad_img
bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold])
......@@ -129,26 +130,26 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
# if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]): # 这些透明图片所在的页面上有文字大于阈值
# return True
img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in img_sz_list] # 过滤掉重复出现的图片
img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in
img_sz_list] # 过滤掉重复出现的图片
# 有的扫描版会把一页图片拆成很多张,需要先把图拼起来再计算
img_sz_list = merge_images(img_sz_list, page_width, page_height)
# 计算每个页面上最大的图的面积,然后计算这个面积占页面面积的比例
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in img_sz_list]
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
img_sz_list]
page_area = page_width * page_height
max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5]
if len(max_image_area_per_page) >= 0.5 * total_page: # 阈值从0.8改到0.5,适配3页里面有两页和两页里面有一页的情况
if len(max_image_area_per_page) >= 0.5 * total_page: # 阈值从0.8改到0.5,适配3页里面有两页和两页里面有一页的情况
# 这里条件成立的前提是把反复出现的图片去掉了。这些图片是隐藏的透明图层,其特点是id都一样
return False
else:
return True
def classify_by_text_len(text_len_list: list, total_page: int):
"""
随机抽取10%的页面,如果少于5个页面,那么就取全部页面。
......@@ -173,6 +174,7 @@ def classify_by_text_len(text_len_list: list, total_page: int):
is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst])
return is_text_pdf
def classify_by_avg_words(text_len_list: list):
"""
补充规则,如果平均每页字数少于 AVG_TEXT_LEN_THRESHOLD,就不是文字pdf
......@@ -193,6 +195,7 @@ def classify_by_avg_words(text_len_list: list):
return is_text_pdf
def classify_by_img_num(img_sz_list: list, img_num_list: list):
"""
补充规则,有一种扫描版本的PDF,每一页都会放所有的扫描页进去,在 metascan 时会被去重,
......@@ -208,11 +211,11 @@ def classify_by_img_num(img_sz_list: list, img_num_list: list):
# img_sz_list中非空元素的个数小于1,前80%的元素都相等,且最大值大于等于junk_limit_min
if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min:
#拿max和min的值,用来判断list内的值是否全都相等
# min_imgs = min(img_num_list)
# max_imgs = max(img_num_list)
#
# if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
#拿max和min的值,用来判断list内的值是否全都相等
# min_imgs = min(img_num_list)
# max_imgs = max(img_num_list)
#
# if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
return False # 如果满足这个条件,一定不是文字版pdf
else:
return True # 不满足这三个条件,可能是文字版pdf,通过其他规则判断
......@@ -244,6 +247,7 @@ def classify_by_text_layout(text_layout_per_page: list):
else:
return False # 文本布局未知,默认认为不是文字版pdf
def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
"""
判断一页是否由细长条组成,有两个条件:
......@@ -258,6 +262,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
Returns:
bool: 如果满足条件的页面的比例小于0.5,返回True,否则返回False
"""
def is_narrow_strip(img):
x0, y0, x1, y1, _ = img
width, height = x1 - x0, y1 - y0
......@@ -299,7 +304,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
return narrow_strip_pages_ratio < 0.5
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
text_layout_list: list):
"""
这里的图片和页面长度单位是pts
:param total_page:
......@@ -324,7 +330,9 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
elif not any(results.values()):
return False, results
else:
logger.warning(f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
logger.warning(
f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}",
file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
return False, results
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment