"test/git@developer.sourcefind.cn:change/sglang.git" did not exist on "e7bc600304e98fa54184f4d7331b4e68016890b4"
Commit c638fc5d authored by myhloli's avatar myhloli
Browse files

fix(pdf): improve ligature handling and text extraction

- Move ligature replacement function to pdf_parse_union_core_v2.py
- Optimize ligature replacement using a more efficient approach
- Modify text extraction flags to preserve ligatures in PDF content
- Remove unnecessary function from ocr_mkcontent.py
parent 6a75d7dc
...@@ -125,16 +125,6 @@ def detect_language(text): ...@@ -125,16 +125,6 @@ def detect_language(text):
return 'empty' return 'empty'
# 连写字符拆分
def __replace_ligatures(text: str):
text = re.sub(r'fi', 'fi', text) # 替换 fi 连写符
text = re.sub(r'fl', 'fl', text) # 替换 fl 连写符
text = re.sub(r'ff', 'ff', text) # 替换 ff 连写符
text = re.sub(r'ffi', 'ffi', text) # 替换 ffi 连写符
text = re.sub(r'ffl', 'ffl', text) # 替换 ffl 连写符
return text
def merge_para_with_text(para_block): def merge_para_with_text(para_block):
block_text = '' block_text = ''
for line in para_block['lines']: for line in para_block['lines']:
......
import copy import copy
import os import os
import re
import statistics import statistics
import time import time
from typing import List from typing import List
...@@ -63,6 +64,15 @@ def __replace_0xfffd(text_str: str): ...@@ -63,6 +64,15 @@ def __replace_0xfffd(text_str: str):
return s return s
return text_str return text_str
# 连写字符拆分
def __replace_ligatures(text: str):
ligatures = {
'fi': 'fi', 'fl': 'fl', 'ff': 'ff', 'ffi': 'ffi', 'ffl': 'ffl', 'ſt': 'ft', 'st': 'st'
}
return re.sub('|'.join(map(re.escape, ligatures.keys())), lambda m: ligatures[m.group()], text)
def chars_to_content(span): def chars_to_content(span):
# 检查span中的char是否为空 # 检查span中的char是否为空
if len(span['chars']) == 0: if len(span['chars']) == 0:
...@@ -83,6 +93,7 @@ def chars_to_content(span): ...@@ -83,6 +93,7 @@ def chars_to_content(span):
content += ' ' content += ' '
content += char['c'] content += char['c']
content = __replace_ligatures(content)
span['content'] = __replace_0xfffd(content) span['content'] = __replace_0xfffd(content)
del span['chars'] del span['chars']
...@@ -152,9 +163,11 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33): ...@@ -152,9 +163,11 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang): def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
# cid用0xfffd表示,连字符拆开
# text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks'] # cid用0xfffd表示,连字符不拆开
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
all_pymu_chars = [] all_pymu_chars = []
for block in text_blocks_raw: for block in text_blocks_raw:
for line in block['lines']: for line in block['lines']:
...@@ -255,10 +268,6 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang ...@@ -255,10 +268,6 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
return spans return spans
def replace_text_span(pymu_spans, ocr_spans):
    """Swap out OCR text spans for PyMuPDF-extracted ones.

    Keeps every non-text span from the OCR result (images, tables, formulas)
    and appends the PyMuPDF text spans in their place.
    """
    non_text_spans = [span for span in ocr_spans if span['type'] != ContentType.Text]
    return non_text_spans + pymu_spans
def model_init(model_name: str): def model_init(model_name: str):
from transformers import LayoutLMv3ForTokenClassification from transformers import LayoutLMv3ForTokenClassification
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment