Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
b4dfa0f9
Unverified
Commit
b4dfa0f9
authored
Nov 28, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 28, 2024
Browse files
Merge pull request #1136 from myhloli/dev
refactor(ocr): improve text processing and span handling
parents
c295587b
88c0854a
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
13 additions
and
22 deletions
+13
-22
magic_pdf/dict2md/ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+9
-21
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+4
-1
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
b4dfa0f9
...
...
@@ -136,14 +136,11 @@ def merge_para_with_text(para_block):
para_text
+=
'
\n
'
line_text
=
''
line_lang
=
''
for
span
in
line
[
'spans'
]:
span_type
=
span
[
'type'
]
if
span_type
==
ContentType
.
Text
:
line_text
+=
span
[
'content'
].
strip
()
if
line_text
!=
''
:
line_lang
=
detect_lang
(
line_text
)
for
j
,
span
in
enumerate
(
line
[
'spans'
]):
span_type
=
span
[
'type'
]
...
...
@@ -157,27 +154,18 @@ def merge_para_with_text(para_block):
content
=
content
.
strip
()
if
content
!=
''
:
langs
=
[
'zh'
,
'ja'
,
'ko'
]
if
line_lang
in
langs
:
# 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
if
span_type
in
[
ContentType
.
Text
,
ContentType
.
InterlineEquation
]:
para_text
+=
content
# 中文/日语/韩文语境下,content间不需要空格分隔
elif
span_type
==
ContentType
.
InlineEquation
:
para_text
+=
f
'
{
content
}
'
else
:
if
span_type
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
]:
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
if
j
==
len
(
line
[
'spans'
])
-
1
and
__is_hyphen_at_line_end
(
content
):
para_text
+=
content
[:
-
1
]
elif
len
(
content
)
==
1
and
content
not
in
[
'A'
,
'I'
,
'a'
,
'i'
]
and
not
content
.
isdigit
():
para_text
+=
content
else
:
# 西方文本语境下 content间需要空格分隔
else
:
# content间需要空格分隔
para_text
+=
f
'
{
content
}
'
elif
span_type
==
ContentType
.
InterlineEquation
:
para_text
+=
content
else
:
continue
# 连写字符拆分
para_text
=
__replace_ligatures
(
para_text
)
#
para_text = __replace_ligatures(para_text)
return
para_text
...
...
magic_pdf/pdf_parse_union_core_v2.py
View file @
b4dfa0f9
...
...
@@ -84,6 +84,9 @@ def chars_to_content(span):
LINE_STOP_FLAG
=
(
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
')'
,
')'
,
'"'
,
'”'
,
':'
,
':'
,
';'
,
';'
,
']'
,
'】'
,
'}'
,
'}'
,
'>'
,
'》'
,
'、'
,
','
,
','
,
'-'
,
'—'
,
'–'
,)
def
fill_char_in_spans
(
spans
,
all_chars
):
# 简单从上到下排一下序
spans
=
sorted
(
spans
,
key
=
lambda
x
:
x
[
'bbox'
][
1
])
for
char
in
all_chars
:
for
span
in
spans
:
# 判断char是否属于LINE_STOP_FLAG
...
...
@@ -137,7 +140,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
def
txt_spans_extract_v2
(
pdf_page
,
spans
,
all_bboxes
,
all_discarded_blocks
,
lang
):
text_blocks_raw
=
pdf_page
.
get_text
(
'rawdict'
,
flags
=
fitz
.
TEXT
FLAGS_TEXT
)[
'blocks'
]
text_blocks_raw
=
pdf_page
.
get_text
(
'rawdict'
,
flags
=
fitz
.
TEXT
_PRESERVE_WHITESPACE
|
fitz
.
TEXT_MEDIABOX_CLIP
|
fitz
.
TEXT_CID_FOR_UNKNOWN_UNICODE
)[
'blocks'
]
all_pymu_chars
=
[]
for
block
in
text_blocks_raw
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment