Commit f2115541 authored by myhloli
Browse files

refactor: improve text processing by adding ligature and unicode replacement functions

parent 76e1a7c1
...@@ -132,7 +132,6 @@ class BatchAnalyze: ...@@ -132,7 +132,6 @@ class BatchAnalyze:
# 获取OCR模型 # 获取OCR模型
ocr_model = atom_model_manager.get_atom_model( ocr_model = atom_model_manager.get_atom_model(
atom_model_name='ocr', atom_model_name='ocr',
ocr_show_log=False,
det_db_box_thresh=0.3, det_db_box_thresh=0.3,
lang=lang lang=lang
) )
......
...@@ -38,7 +38,7 @@ def fill_spans_in_blocks(blocks, spans, radio): ...@@ -38,7 +38,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
def span_block_type_compatible(span_type, block_type): def span_block_type_compatible(span_type, block_type):
if span_type in [ContentType.TEXT, ContentType.INTERLINE_EQUATION]: if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
return block_type in [ return block_type in [
BlockType.TEXT, BlockType.TEXT,
BlockType.TITLE, BlockType.TITLE,
......
# Copyright (c) Opendatalab. All rights reserved. # Copyright (c) Opendatalab. All rights reserved.
import re
import cv2 import cv2
import numpy as np import numpy as np
...@@ -100,6 +101,19 @@ def remove_overlaps_min_spans(spans): ...@@ -100,6 +101,19 @@ def remove_overlaps_min_spans(spans):
return spans, dropped_spans return spans, dropped_spans
def __replace_ligatures(text: str):
    """Expand typographic ligature characters in *text* to plain letters.

    The table is keyed on explicit escapes for the Unicode "Alphabetic
    Presentation Forms" ligatures (U+FB00-U+FB06) so the mapping survives
    editors or pipelines that normalize the glyphs to ASCII, which would
    otherwise turn every entry into a key-equals-value no-op.

    Args:
        text: The string to clean up.

    Returns:
        A copy of *text* with each ligature replaced by its letter sequence.
        Text without ligatures is returned unchanged.
    """
    ligatures = {
        '\ufb00': 'ff',
        '\ufb01': 'fi',
        '\ufb02': 'fl',
        '\ufb03': 'ffi',
        '\ufb04': 'ffl',
        # NOTE(review): U+FB05 is the long-s + t ligature; Unicode decomposes
        # it to "st", but the existing table maps it to 'ft' - confirm intent.
        '\ufb05': 'ft',
        '\ufb06': 'st',
        # Decomposed long-s + t sequence, kept from the existing table.
        '\u017ft': 'ft',
    }
    # One alternation of the escaped keys lets a single pass handle them all.
    pattern = '|'.join(map(re.escape, ligatures))
    return re.sub(pattern, lambda m: ligatures[m.group()], text)
def __replace_unicode(text: str):
    """Strip CRLF pairs from *text* and turn U+0002 into a hyphen.

    Every '\\r\\n' sequence is removed and every U+0002 control character
    becomes '-'. Both substitutions happen in a single regex pass.
    NOTE(review): presumably U+0002 is the extractor's marker for a hyphen
    at a line break - confirm against the text-extraction library.

    Args:
        text: The string to clean up.

    Returns:
        The cleaned string.
    """
    substitutions = {
        '\r\n': '',
        '\u0002': '-',
    }
    pattern = '|'.join(re.escape(token) for token in substitutions.keys())

    def _swap(match):
        # Look up the replacement for whichever token matched.
        return substitutions[match.group()]

    return re.sub(pattern, _swap, text)
def txt_spans_extract(pdf_page, spans, pil_img, scale): def txt_spans_extract(pdf_page, spans, pil_img, scale):
textpage = pdf_page.get_textpage() textpage = pdf_page.get_textpage()
...@@ -117,6 +131,8 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale): ...@@ -117,6 +131,8 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale):
text = textpage.get_text_bounded(left=rect_box[0], top=rect_box[1], text = textpage.get_text_bounded(left=rect_box[0], top=rect_box[1],
right=rect_box[2], bottom=rect_box[3]) right=rect_box[2], bottom=rect_box[3])
if text and len(text) > 0: if text and len(text) > 0:
text = __replace_unicode(text)
text = __replace_ligatures(text)
span['content'] = text.strip() span['content'] = text.strip()
span['score'] = 1.0 span['score'] = 1.0
else: else:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment