Unverified Commit b3fbedf0 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1143 from opendatalab/release-0.10.3

Release 0.10.3
parents 52ef1bc7 66bd0f8b
...@@ -5,7 +5,6 @@ from loguru import logger ...@@ -5,7 +5,6 @@ from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.ocr_content_type import BlockType, ContentType from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.para.para_split_v3 import ListLineTag from magic_pdf.para.para_split_v3 import ListLineTag
...@@ -30,6 +29,13 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, ...@@ -30,6 +29,13 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
for page_info in pdf_info_dict: for page_info in pdf_info_dict:
paras_of_layout = page_info.get('para_blocks') paras_of_layout = page_info.get('para_blocks')
if not paras_of_layout: if not paras_of_layout:
markdown_with_para_and_pagination.append({
'page_no':
page_no,
'md_content':
'',
})
page_no += 1
continue continue
page_markdown = ocr_mk_markdown_with_para_core_v2( page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'mm', img_buket_path) paras_of_layout, 'mm', img_buket_path)
...@@ -136,14 +142,11 @@ def merge_para_with_text(para_block): ...@@ -136,14 +142,11 @@ def merge_para_with_text(para_block):
para_text += ' \n' para_text += ' \n'
line_text = '' line_text = ''
line_lang = ''
for span in line['spans']: for span in line['spans']:
span_type = span['type'] span_type = span['type']
if span_type == ContentType.Text: if span_type == ContentType.Text:
line_text += span['content'].strip() line_text += span['content'].strip()
if line_text != '':
line_lang = detect_lang(line_text)
for j, span in enumerate(line['spans']): for j, span in enumerate(line['spans']):
span_type = span['type'] span_type = span['type']
...@@ -157,27 +160,18 @@ def merge_para_with_text(para_block): ...@@ -157,27 +160,18 @@ def merge_para_with_text(para_block):
content = content.strip() content = content.strip()
if content != '': if content != '':
langs = ['zh', 'ja', 'ko']
if line_lang in langs: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
if span_type in [ContentType.Text, ContentType.InterlineEquation]:
para_text += content # 中文/日语/韩文语境下,content间不需要空格分隔
elif span_type == ContentType.InlineEquation:
para_text += f' {content} '
else:
if span_type in [ContentType.Text, ContentType.InlineEquation]: if span_type in [ContentType.Text, ContentType.InlineEquation]:
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除 # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content): if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
para_text += content[:-1] para_text += content[:-1]
elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit(): else: # content间需要空格分隔
para_text += content
else: # 西方文本语境下 content间需要空格分隔
para_text += f'{content} ' para_text += f'{content} '
elif span_type == ContentType.InterlineEquation: elif span_type == ContentType.InterlineEquation:
para_text += content para_text += content
else: else:
continue continue
# 连写字符拆分 # 连写字符拆分
para_text = __replace_ligatures(para_text) # para_text = __replace_ligatures(para_text)
return para_text return para_text
......
...@@ -8,7 +8,7 @@ from loguru import logger ...@@ -8,7 +8,7 @@ from loguru import logger
from magic_pdf.config.drop_reason import DropReason from magic_pdf.config.drop_reason import DropReason
from magic_pdf.libs.commons import get_top_percent_list, mymax from magic_pdf.libs.commons import get_top_percent_list, mymax
from magic_pdf.libs.language import detect_lang from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.pdf_check import detect_invalid_chars from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
scan_max_page = 50 scan_max_page = 50
junk_limit_min = 10 junk_limit_min = 10
...@@ -323,7 +323,7 @@ def get_language(doc: fitz.Document): ...@@ -323,7 +323,7 @@ def get_language(doc: fitz.Document):
def check_invalid_chars(pdf_bytes):
    """Garbled-text detection.

    Thin wrapper that forwards ``pdf_bytes`` to
    ``detect_invalid_chars_by_pymupdf`` and returns its result unchanged.
    """
    return detect_invalid_chars_by_pymupdf(pdf_bytes)
def pdf_meta_scan(pdf_bytes: bytes): def pdf_meta_scan(pdf_bytes: bytes):
......
from io import BytesIO
import re
import fitz import fitz
import numpy as np import numpy as np
from loguru import logger from loguru import logger
from pdfminer.high_level import extract_text # import re
# from io import BytesIO
# from pdfminer.high_level import extract_text
def calculate_sample_count(total_page: int): def calculate_sample_count(total_page: int):
...@@ -14,7 +14,7 @@ def calculate_sample_count(total_page: int): ...@@ -14,7 +14,7 @@ def calculate_sample_count(total_page: int):
return select_page_cnt return select_page_cnt
def extract_pages(src_pdf_bytes: bytes): def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
pdf_docs = fitz.open("pdf", src_pdf_bytes) pdf_docs = fitz.open("pdf", src_pdf_bytes)
total_page = len(pdf_docs) total_page = len(pdf_docs)
if total_page == 0: if total_page == 0:
...@@ -33,30 +33,57 @@ def extract_pages(src_pdf_bytes: bytes): ...@@ -33,30 +33,57 @@ def extract_pages(src_pdf_bytes: bytes):
return sample_docs return sample_docs
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool: # def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
"""" # """"
检测PDF中是否包含非法字符 # 检测PDF中是否包含非法字符
# """
# '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
# sample_docs = extract_pages(src_pdf_bytes)
# sample_pdf_bytes = sample_docs.tobytes()
# sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
# text = extract_text(sample_pdf_file_like_object)
# text = text.replace("\n", "")
# # logger.info(text)
# '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
# cid_pattern = re.compile(r'\(cid:\d+\)')
# matches = cid_pattern.findall(text)
# cid_count = len(matches)
# cid_len = sum(len(match) for match in matches)
# text_len = len(text)
# if text_len == 0:
# cid_chars_radio = 0
# else:
# cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
# logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
# '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
# if cid_chars_radio > 0.05:
# return False # 乱码文档
# else:
# return True # 正常文档
def count_replacement_characters(text: str) -> int:
    """Return how many U+FFFD (replacement) characters occur in ``text``."""
    return sum(1 for ch in text if ch == '\ufffd')
def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
    """Check the text of a PDF for garbled (U+FFFD) characters.

    The pages returned by ``extract_pages`` are read with PyMuPDF, and the
    share of U+FFFD characters in the combined text is measured.

    Args:
        src_pdf_bytes: Raw bytes of the PDF to inspect.

    Returns:
        ``False`` when more than 1% of the extracted characters are U+FFFD
        (garbled document), ``True`` otherwise (normal document).
    """
    sample_docs = extract_pages(src_pdf_bytes)
    extract_flags = fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP
    doc_text = "".join(
        page.get_text('text', flags=extract_flags) for page in sample_docs
    )
    text_len = len(doc_text)
    uffd_count = count_replacement_characters(doc_text)
    # No extracted text means nothing can be garbled; this also avoids dividing by zero.
    uffd_chars_radio = uffd_count / text_len if text_len != 0 else 0
    logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}")
    # A document is treated as garbled once more than 1% of its text is U+FFFD.
    return uffd_chars_radio <= 0.01
\ No newline at end of file
...@@ -18,11 +18,31 @@ def region_to_bbox(region): ...@@ -18,11 +18,31 @@ def region_to_bbox(region):
class CustomPaddleModel: class CustomPaddleModel:
def __init__(self,
             ocr: bool = False,
             show_log: bool = False,
             lang=None,
             det_db_box_thresh=0.3,
             use_dilation=True,
             det_db_unclip_ratio=1.8
             ):
    """Build the wrapped ``PPStructure`` engine.

    Args:
        ocr: Accepted for interface compatibility; the engine is always
            created with ``ocr=True`` regardless of this value.
        show_log: Forwarded to ``PPStructure`` as ``show_log``.
        lang: Forwarded as ``lang`` only when it is not ``None``.
        det_db_box_thresh: Forwarded as ``det_db_box_thresh``.
        use_dilation: Forwarded as ``use_dilation``.
        det_db_unclip_ratio: Forwarded as ``det_db_unclip_ratio``.
    """
    engine_kwargs = {
        'table': False,
        'ocr': True,
        'show_log': show_log,
        'det_db_box_thresh': det_db_box_thresh,
        'use_dilation': use_dilation,
        'det_db_unclip_ratio': det_db_unclip_ratio,
    }
    # Only pass ``lang`` when the caller chose one, so PPStructure keeps its own default otherwise.
    if lang is not None:
        engine_kwargs['lang'] = lang
    self.model = PPStructure(**engine_kwargs)
def __call__(self, img): def __call__(self, img):
try: try:
......
import cv2
import numpy as np import numpy as np
from loguru import logger from loguru import logger
from io import BytesIO
from PIL import Image
import base64
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
from ppocr.utils.utility import check_and_read
def img_decode(content: bytes):
    """Decode encoded image bytes with OpenCV, keeping the channels as stored.

    The bytes are viewed as a ``uint8`` buffer and handed to
    ``cv2.imdecode`` with ``cv2.IMREAD_UNCHANGED``; its result is returned as is.
    """
    return cv2.imdecode(np.frombuffer(content, dtype=np.uint8), cv2.IMREAD_UNCHANGED)
def check_img(img):
    """Turn an image argument into a decoded OpenCV array.

    Args:
        img: Encoded image bytes, a path to an image file, or any other
            object (for example an already-decoded ``numpy.ndarray``).

    Returns:
        The decoded image. A 2-D ``numpy.ndarray`` is converted with
        ``cv2.COLOR_GRAY2BGR`` before being returned. ``None`` is returned
        when a file path cannot be loaded; the failure is logged.
    """
    if isinstance(img, bytes):
        img = img_decode(img)
    if isinstance(img, str):
        image_file = img
        img, flag_gif, flag_pdf = check_and_read(image_file)
        if not flag_gif and not flag_pdf:
            with open(image_file, 'rb') as f:
                img_str = f.read()
            img = img_decode(img_str)
            if img is None:
                # OpenCV could not decode the file: re-encode it as an RGB
                # JPEG through Pillow and let OpenCV decode that instead.
                try:
                    buf = BytesIO()
                    Image.open(BytesIO(img_str)).convert('RGB').save(buf, 'jpeg')
                    img_array = np.frombuffer(buf.getvalue(), np.uint8)
                    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
                except Exception:
                    # Deliberately not a bare ``except``: KeyboardInterrupt
                    # and SystemExit must still propagate.
                    logger.error("error in loading image:{}".format(image_file))
                    return None
        if img is None:
            logger.error("error in loading image:{}".format(image_file))
            return None
    if isinstance(img, np.ndarray) and len(img.shape) == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    return img
def bbox_to_points(bbox): def bbox_to_points(bbox):
""" 将bbox格式转换为四个顶点的数组 """ """ 将bbox格式转换为四个顶点的数组 """
......
import copy import copy
import time import time
import cv2 import cv2
import numpy as np import numpy as np
from paddleocr import PaddleOCR from paddleocr import PaddleOCR
from paddleocr.paddleocr import check_img, logger from ppocr.utils.logging import get_logger
from paddleocr.ppocr.utils.utility import alpha_to_color, binarize_img from ppocr.utils.utility import alpha_to_color, binarize_img
from paddleocr.tools.infer.predict_system import sorted_boxes from tools.infer.predict_system import sorted_boxes
from paddleocr.tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop from tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes logger = get_logger()
class ModifiedPaddleOCR(PaddleOCR): class ModifiedPaddleOCR(PaddleOCR):
......
...@@ -2,8 +2,8 @@ import os ...@@ -2,8 +2,8 @@ import os
import cv2 import cv2
import numpy as np import numpy as np
from paddleocr.ppstructure.table.predict_table import TableSystem from ppstructure.table.predict_table import TableSystem
from paddleocr.ppstructure.utility import init_args from ppstructure.utility import init_args
from PIL import Image from PIL import Image
from magic_pdf.config.constants import * # noqa: F403 from magic_pdf.config.constants import * # noqa: F403
......
import copy import copy
from loguru import logger
from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED
from magic_pdf.config.ocr_content_type import BlockType, ContentType from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.language import detect_lang
LINE_STOP_FLAG = ( LINE_STOP_FLAG = (
'.', '.',
...@@ -125,6 +128,9 @@ def __is_list_or_index_block(block): ...@@ -125,6 +128,9 @@ def __is_list_or_index_block(block):
# 添加所有文本,包括空行,保持与block['lines']长度一致 # 添加所有文本,包括空行,保持与block['lines']长度一致
lines_text_list.append(line_text) lines_text_list.append(line_text)
block_text = ''.join(lines_text_list)
block_lang = detect_lang(block_text)
# logger.info(f"block_lang: {block_lang}")
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断 # 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2: if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
...@@ -135,10 +141,13 @@ def __is_list_or_index_block(block): ...@@ -135,10 +141,13 @@ def __is_list_or_index_block(block):
# 计算右侧是否顶格 # 计算右侧是否顶格
if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height: if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height:
right_close_num += 1 right_close_num += 1
else:
# 类中文没有超长单词的情况,可以用统一的阈值
if block_lang in ['zh', 'ja', 'ko']:
closed_area = 0.26 * block_weight
else: else:
# 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值 # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
# block宽的阈值可以小些,block窄的阈值要大 # block宽的阈值可以小些,block窄的阈值要大
if block_weight_radio >= 0.5: if block_weight_radio >= 0.5:
closed_area = 0.26 * block_weight closed_area = 0.26 * block_weight
else: else:
......
...@@ -30,22 +30,14 @@ try: ...@@ -30,22 +30,14 @@ try:
torchtext.disable_torchtext_deprecation_warning() torchtext.disable_torchtext_deprecation_warning()
except ImportError: except ImportError:
pass pass
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
from magic_pdf.para.para_split_v3 import para_split from magic_pdf.para.para_split_v3 import para_split
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
from magic_pdf.pre_proc.construct_page_dict import \
ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \ from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
ocr_prepare_bboxes_for_layout_split_v2 from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
fix_block_spans_v2,
fix_discarded_block)
from magic_pdf.pre_proc.ocr_span_list_modify import (
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
remove_overlaps_min_spans)
def __replace_STX_ETX(text_str: str): def __replace_STX_ETX(text_str: str):
...@@ -65,10 +57,18 @@ def __replace_STX_ETX(text_str: str): ...@@ -65,10 +57,18 @@ def __replace_STX_ETX(text_str: str):
return text_str return text_str
def __replace_0xfffd(text_str: str):
    """Swap every U+FFFD character in ``text_str`` for a single space.

    These characters show up as garbled output when text is extracted with
    pymupdf. Falsy input (``None`` or an empty string) is returned unchanged.
    """
    if not text_str:
        return text_str
    return text_str.replace('\ufffd', " ")
def chars_to_content(span): def chars_to_content(span):
# 检查span中的char是否为空 # 检查span中的char是否为空
if len(span['chars']) == 0: if len(span['chars']) == 0:
span['content'] = '' pass
# span['content'] = ''
else: else:
# 先给chars按char['bbox']的中心点的x坐标排序 # 先给chars按char['bbox']的中心点的x坐标排序
span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2) span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
...@@ -83,22 +83,24 @@ def chars_to_content(span): ...@@ -83,22 +83,24 @@ def chars_to_content(span):
if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width: if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
content += ' ' content += ' '
content += char['c'] content += char['c']
span['content'] = __replace_STX_ETX(content)
span['content'] = __replace_0xfffd(content)
del span['chars'] del span['chars']
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',) LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
def fill_char_in_spans(spans, all_chars): def fill_char_in_spans(spans, all_chars):
# 简单从上到下排一下序
spans = sorted(spans, key=lambda x: x['bbox'][1])
for char in all_chars: for char in all_chars:
for span in spans: for span in spans:
# 判断char是否属于LINE_STOP_FLAG if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
if char['c'] in LINE_STOP_FLAG:
char_is_line_stop_flag = True
else:
char_is_line_stop_flag = False
if calculate_char_in_span(char['bbox'], span['bbox'], char_is_line_stop_flag):
span['chars'].append(char) span['chars'].append(char)
break break
...@@ -106,13 +108,16 @@ def fill_char_in_spans(spans, all_chars): ...@@ -106,13 +108,16 @@ def fill_char_in_spans(spans, all_chars):
for span in spans: for span in spans:
chars_to_content(span) chars_to_content(span)
if len(span['content']) == 0: # 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
if len(span['content']) * span['height'] < span['width'] * 0.5:
# logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
empty_spans.append(span) empty_spans.append(span)
del span['height'], span['width']
return empty_spans return empty_spans
# 使用鲁棒性更强的中心点坐标判断 # 使用鲁棒性更强的中心点坐标判断
def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag): def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
char_center_x = (char_bbox[0] + char_bbox[2]) / 2 char_center_x = (char_bbox[0] + char_bbox[2]) / 2
char_center_y = (char_bbox[1] + char_bbox[3]) / 2 char_center_y = (char_bbox[1] + char_bbox[3]) / 2
span_center_y = (span_bbox[1] + span_bbox[3]) / 2 span_center_y = (span_bbox[1] + span_bbox[3]) / 2
...@@ -121,18 +126,26 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag): ...@@ -121,18 +126,26 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
if ( if (
span_bbox[0] < char_center_x < span_bbox[2] span_bbox[0] < char_center_x < span_bbox[2]
and span_bbox[1] < char_center_y < span_bbox[3] and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height / 4 # 字符的中轴和span的中轴高度差不能超过1/4span高度 and abs(char_center_y - span_center_y) < span_height * span_height_radio # 字符的中轴和span的中轴高度差不能超过1/4span高度
): ):
return True return True
else: else:
# 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致) # 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致)
# 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近 # 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近
if char_is_line_stop_flag: if char in LINE_STOP_FLAG:
if ( if (
(span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2] (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
and char_center_x > span_bbox[0] and char_center_x > span_bbox[0]
and span_bbox[1] < char_center_y < span_bbox[3] and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height / 4 and abs(char_center_y - span_center_y) < span_height * span_height_radio
):
return True
elif char in LINE_START_FLAG:
if (
span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
and char_center_x < span_bbox[2]
and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height * span_height_radio
): ):
return True return True
else: else:
...@@ -141,12 +154,14 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag): ...@@ -141,12 +154,14 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang): def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks'] text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
# @todo: 拿到char之后把倾斜角度较大的先删一遍
all_pymu_chars = [] all_pymu_chars = []
for block in text_blocks_raw: for block in text_blocks_raw:
for line in block['lines']: for line in block['lines']:
cosine, sine = line['dir']
if abs (cosine) < 0.9 or abs(sine) > 0.1:
continue
for span in line['spans']: for span in line['spans']:
all_pymu_chars.extend(span['chars']) all_pymu_chars.extend(span['chars'])
...@@ -157,6 +172,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang ...@@ -157,6 +172,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
continue continue
span_height = span['bbox'][3] - span['bbox'][1] span_height = span['bbox'][3] - span['bbox'][1]
span['height'] = span_height span['height'] = span_height
span['width'] = span['bbox'][2] - span['bbox'][0]
span_height_list.append(span_height) span_height_list.append(span_height)
if len(span_height_list) == 0: if len(span_height_list) == 0:
return spans return spans
...@@ -174,15 +190,13 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang ...@@ -174,15 +190,13 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]: if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
continue continue
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5: if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
if span['height'] > median_span_height * 3 and span['height'] > (span['bbox'][2] - span['bbox'][0]) * 3: if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
vertical_spans.append(span) vertical_spans.append(span)
elif block in all_bboxes: elif block in all_bboxes:
useful_spans.append(span) useful_spans.append(span)
else: else:
unuseful_spans.append(span) unuseful_spans.append(span)
del span['height']
break break
"""垂直的span框直接用pymu的line进行填充""" """垂直的span框直接用pymu的line进行填充"""
...@@ -232,6 +246,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang ...@@ -232,6 +246,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
if ocr_res and len(ocr_res) > 0: if ocr_res and len(ocr_res) > 0:
if len(ocr_res[0]) > 0: if len(ocr_res[0]) > 0:
ocr_text, ocr_score = ocr_res[0][0] ocr_text, ocr_score = ocr_res[0][0]
# logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
if ocr_score > 0.5 and len(ocr_text) > 0: if ocr_score > 0.5 and len(ocr_text) > 0:
span['content'] = ocr_text span['content'] = ocr_text
span['score'] = ocr_score span['score'] = ocr_score
......
...@@ -4,10 +4,10 @@ click>=8.1.7 ...@@ -4,10 +4,10 @@ click>=8.1.7
fast-langdetect==0.2.0 fast-langdetect==0.2.0
loguru>=0.6.0 loguru>=0.6.0
numpy>=1.21.6,<2.0.0 numpy>=1.21.6,<2.0.0
pdfminer.six==20231228
pydantic>=2.7.2,<2.8.0 pydantic>=2.7.2,<2.8.0
PyMuPDF>=1.24.9 PyMuPDF>=1.24.9
scikit-learn>=1.0.2 scikit-learn>=1.0.2
torch>=2.2.2,<=2.3.1 torch>=2.2.2,<=2.3.1
transformers transformers
# pdfminer.six==20231228
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator. # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment