Unverified commit b03a7fae, authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge pull request #1153 from opendatalab/release-0.10.4

Release 0.10.4
parents d19911f1 9726403c
...@@ -5,6 +5,7 @@ from loguru import logger ...@@ -5,6 +5,7 @@ from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.ocr_content_type import BlockType, ContentType from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.para.para_split_v3 import ListLineTag from magic_pdf.para.para_split_v3 import ListLineTag
...@@ -135,18 +136,19 @@ def __replace_ligatures(text: str): ...@@ -135,18 +136,19 @@ def __replace_ligatures(text: str):
def merge_para_with_text(para_block): def merge_para_with_text(para_block):
block_text = ''
for line in para_block['lines']:
for span in line['spans']:
if span['type'] in [ContentType.Text]:
block_text += span['content']
block_lang = detect_lang(block_text)
para_text = '' para_text = ''
for i, line in enumerate(para_block['lines']): for i, line in enumerate(para_block['lines']):
if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False): if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
para_text += ' \n' para_text += ' \n'
line_text = ''
for span in line['spans']:
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()
for j, span in enumerate(line['spans']): for j, span in enumerate(line['spans']):
span_type = span['type'] span_type = span['type']
...@@ -159,12 +161,21 @@ def merge_para_with_text(para_block): ...@@ -159,12 +161,21 @@ def merge_para_with_text(para_block):
content = f"\n$$\n{span['content']}\n$$\n" content = f"\n$$\n{span['content']}\n$$\n"
content = content.strip() content = content.strip()
if content != '':
if content:
langs = ['zh', 'ja', 'ko']
# logger.info(f'block_lang: {block_lang}, content: {content}')
if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
if j == len(line['spans']) - 1:
para_text += content
else:
para_text += f'{content} '
else:
if span_type in [ContentType.Text, ContentType.InlineEquation]: if span_type in [ContentType.Text, ContentType.InlineEquation]:
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除 # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content): if j == len(line['spans'])-1 and span_type == ContentType.Text and __is_hyphen_at_line_end(content):
para_text += content[:-1] para_text += content[:-1]
else: # content间需要空格分隔 else: # 西方文本语境下 content间需要空格分隔
para_text += f'{content} ' para_text += f'{content} '
elif span_type == ContentType.InterlineEquation: elif span_type == ContentType.InterlineEquation:
para_text += content para_text += content
......
...@@ -117,8 +117,8 @@ def ocr_prepare_bboxes_for_layout_split_v2( ...@@ -117,8 +117,8 @@ def ocr_prepare_bboxes_for_layout_split_v2(
all_bboxes = remove_overlaps_min_blocks(all_bboxes) all_bboxes = remove_overlaps_min_blocks(all_bboxes)
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks) all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
"""将剩余的bbox做分离处理,防止后面分layout时出错""" """将剩余的bbox做分离处理,防止后面分layout时出错"""
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes) # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
all_bboxes.sort(key=lambda x: x[0]+x[1])
return all_bboxes, all_discarded_blocks return all_bboxes, all_discarded_blocks
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment