Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
379cf150
Unverified
Commit
379cf150
authored
Dec 13, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Dec 13, 2024
Browse files
Merge pull request #1291 from myhloli/add-llm-aided
fix(pdf): improve ligature handling and text extraction
parents
5e3890f5
c638fc5d
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
15 additions
and
16 deletions
+15
-16
magic_pdf/dict2md/ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+0
-10
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+15
-6
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
379cf150
...
...
@@ -125,16 +125,6 @@ def detect_language(text):
return
'empty'
# Split ligature glyphs into their component letters.
def __replace_ligatures(text: str) -> str:
    """Return *text* with Latin f-ligature glyphs spelled out as plain letters.

    The five presentation forms U+FB00..U+FB04 (ff, fi, fl, ffi, ffl) are each
    replaced by the ASCII letters they stand for; every other character is
    left unchanged.

    The glyphs are written as explicit Unicode escapes rather than literal
    characters so that an editor or tool that normalises the source text
    cannot silently turn the mapping into a no-op (e.g. 'fi' -> 'fi').
    """
    ligatures = {
        '\ufb01': 'fi',   # LATIN SMALL LIGATURE FI
        '\ufb02': 'fl',   # LATIN SMALL LIGATURE FL
        '\ufb00': 'ff',   # LATIN SMALL LIGATURE FF
        '\ufb03': 'ffi',  # LATIN SMALL LIGATURE FFI
        '\ufb04': 'ffl',  # LATIN SMALL LIGATURE FFL
    }
    # Each key is a single code point, so one translate() pass replaces the
    # five sequential regex substitutions without any ordering concerns.
    return text.translate(str.maketrans(ligatures))
def
merge_para_with_text
(
para_block
):
block_text
=
''
for
line
in
para_block
[
'lines'
]:
...
...
magic_pdf/pdf_parse_union_core_v2.py
View file @
379cf150
import
copy
import
os
import
re
import
statistics
import
time
from
typing
import
List
...
...
@@ -63,6 +64,15 @@ def __replace_0xfffd(text_str: str):
return
s
return
text_str
# Ligature presentation forms (U+FB00..U+FB06) mapped to their plain-letter
# spellings. Built once at import time; keys are explicit Unicode escapes so
# source-text normalisation cannot collapse them into their ASCII expansions.
_LIGATURE_TABLE = str.maketrans({
    '\ufb01': 'fi',   # LATIN SMALL LIGATURE FI
    '\ufb02': 'fl',   # LATIN SMALL LIGATURE FL
    '\ufb00': 'ff',   # LATIN SMALL LIGATURE FF
    '\ufb03': 'ffi',  # LATIN SMALL LIGATURE FFI
    '\ufb04': 'ffl',  # LATIN SMALL LIGATURE FFL
    # U+FB05 is "long s" + "t"; the long s looks like an f but is an s, and
    # the Unicode compatibility normalisation (NFKC) of this glyph is "st".
    '\ufb05': 'st',   # LATIN SMALL LIGATURE LONG S T
    '\ufb06': 'st',   # LATIN SMALL LIGATURE ST
})


# Split ligature glyphs into their component letters.
def __replace_ligatures(text: str) -> str:
    """Return *text* with Latin ligature glyphs spelled out as plain letters.

    Each of the seven single-code-point ligatures U+FB00..U+FB06 is replaced
    by the letters it represents; all other characters pass through
    unchanged. An empty string is returned as-is.
    """
    # Every key is exactly one code point, so a translate() pass is
    # equivalent to the alternation regex and needs no match ordering.
    return text.translate(_LIGATURE_TABLE)
def
chars_to_content
(
span
):
# 检查span中的char是否为空
if
len
(
span
[
'chars'
])
==
0
:
...
...
@@ -83,6 +93,7 @@ def chars_to_content(span):
content
+=
' '
content
+=
char
[
'c'
]
content
=
__replace_ligatures
(
content
)
span
[
'content'
]
=
__replace_0xfffd
(
content
)
del
span
[
'chars'
]
...
...
@@ -152,9 +163,11 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
def
txt_spans_extract_v2
(
pdf_page
,
spans
,
all_bboxes
,
all_discarded_blocks
,
lang
):
# cid用0xfffd表示,连字符拆开
# text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
text_blocks_raw
=
pdf_page
.
get_text
(
'rawdict'
,
flags
=
fitz
.
TEXT_PRESERVE_WHITESPACE
|
fitz
.
TEXT_MEDIABOX_CLIP
)[
'blocks'
]
# cid用0xfffd表示,连字符不拆开
text_blocks_raw
=
pdf_page
.
get_text
(
'rawdict'
,
flags
=
fitz
.
TEXT_PRESERVE_LIGATURES
|
fitz
.
TEXT_PRESERVE_WHITESPACE
|
fitz
.
TEXT_MEDIABOX_CLIP
)[
'blocks'
]
all_pymu_chars
=
[]
for
block
in
text_blocks_raw
:
for
line
in
block
[
'lines'
]:
...
...
@@ -255,10 +268,6 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
return
spans
def replace_text_span(pymu_spans, ocr_spans):
    """Merge two span lists, discarding the text spans from *ocr_spans*.

    Args:
        pymu_spans: list of spans appended unchanged to the result
            (presumably produced by PyMuPDF - confirm against the caller).
        ocr_spans: iterable of dict-like spans, each with a 'type' key.

    Returns:
        A new list holding every span of *ocr_spans* whose 'type' is not
        ``ContentType.Text``, followed by all of *pymu_spans*.
    """
    non_text_ocr_spans = [
        span for span in ocr_spans if span['type'] != ContentType.Text
    ]
    return non_text_ocr_spans + pymu_spans
def
model_init
(
model_name
:
str
):
from
transformers
import
LayoutLMv3ForTokenClassification
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment