Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
61e88cb2
Unverified
Commit
61e88cb2
authored
Nov 25, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 25, 2024
Browse files
Merge pull request #1086 from myhloli/dev
refactor(txt_spans_extract_v2): optimize span processing and OCR logic
parents
6c4040ac
160624bd
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
40 additions
and
46 deletions
+40
-46
magic_pdf/para/para_split_v3.py
magic_pdf/para/para_split_v3.py
+7
-2
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+33
-44
No files found.
magic_pdf/para/para_split_v3.py
View file @
61e88cb2
...
@@ -271,13 +271,18 @@ def __merge_2_text_blocks(block1, block2):
...
@@ -271,13 +271,18 @@ def __merge_2_text_blocks(block1, block2):
first_span
=
first_line
[
'spans'
][
0
]
first_span
=
first_line
[
'spans'
][
0
]
if
len
(
first_span
[
'content'
])
>
0
:
if
len
(
first_span
[
'content'
])
>
0
:
span_start_with_num
=
first_span
[
'content'
][
0
].
isdigit
()
span_start_with_num
=
first_span
[
'content'
][
0
].
isdigit
()
span_start_with_big_char
=
first_span
[
'content'
][
0
].
isupper
()
if
(
if
(
abs
(
block2
[
'bbox_fs'
][
2
]
-
last_line
[
'bbox'
][
2
])
# 上一个block的最后一个line的右边界和block的右边界差距不超过line_height
<
line_height
abs
(
block2
[
'bbox_fs'
][
2
]
-
last_line
[
'bbox'
][
2
])
<
line_height
# 上一个block的最后一个span不是以特定符号结尾
and
not
last_span
[
'content'
].
endswith
(
LINE_STOP_FLAG
)
and
not
last_span
[
'content'
].
endswith
(
LINE_STOP_FLAG
)
# 两个block宽度差距超过2倍也不合并
# 两个block宽度差距超过2倍也不合并
and
abs
(
block1_weight
-
block2_weight
)
<
min_block_weight
and
abs
(
block1_weight
-
block2_weight
)
<
min_block_weight
# 下一个block的第一个字符是数字
and
not
span_start_with_num
and
not
span_start_with_num
# 下一个block的第一个字符是大写字母
and
not
span_start_with_big_char
):
):
if
block1
[
'page_num'
]
!=
block2
[
'page_num'
]:
if
block1
[
'page_num'
]
!=
block2
[
'page_num'
]:
for
line
in
block1
[
'lines'
]:
for
line
in
block1
[
'lines'
]:
...
...
magic_pdf/pdf_parse_union_core_v2.py
View file @
61e88cb2
...
@@ -89,28 +89,25 @@ def __replace_STX_ETX(text_str: str):
...
@@ -89,28 +89,25 @@ def __replace_STX_ETX(text_str: str):
def
chars_to_content
(
span
):
def
chars_to_content
(
span
):
# # 先给chars按char['bbox']的x坐标排序
# 检查span中的char是否为空
# span['chars'] = sorted(span['chars'], key=lambda x: x['bbox'][0])
if
len
(
span
[
'chars'
])
==
0
:
span
[
'content'
]
=
''
else
:
# 先给chars按char['bbox']的中心点的x坐标排序
# 先给chars按char['bbox']的中心点的x坐标排序
span
[
'chars'
]
=
sorted
(
span
[
'chars'
],
key
=
lambda
x
:
(
x
[
'bbox'
][
0
]
+
x
[
'bbox'
][
2
])
/
2
)
span
[
'chars'
]
=
sorted
(
span
[
'chars'
],
key
=
lambda
x
:
(
x
[
'bbox'
][
0
]
+
x
[
'bbox'
][
2
])
/
2
)
content
=
''
# 求char的平均宽度
# 求char的平均宽度
if
len
(
span
[
'chars'
])
==
0
:
span
[
'content'
]
=
content
del
span
[
'chars'
]
return
else
:
char_width_sum
=
sum
([
char
[
'bbox'
][
2
]
-
char
[
'bbox'
][
0
]
for
char
in
span
[
'chars'
]])
char_width_sum
=
sum
([
char
[
'bbox'
][
2
]
-
char
[
'bbox'
][
0
]
for
char
in
span
[
'chars'
]])
char_avg_width
=
char_width_sum
/
len
(
span
[
'chars'
])
char_avg_width
=
char_width_sum
/
len
(
span
[
'chars'
])
content
=
''
for
char
in
span
[
'chars'
]:
for
char
in
span
[
'chars'
]:
# 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
# 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
if
char
[
'bbox'
][
0
]
-
span
[
'chars'
][
span
[
'chars'
].
index
(
char
)
-
1
][
'bbox'
][
2
]
>
char_avg_width
:
if
char
[
'bbox'
][
0
]
-
span
[
'chars'
][
span
[
'chars'
].
index
(
char
)
-
1
][
'bbox'
][
2
]
>
char_avg_width
:
content
+=
' '
content
+=
' '
content
+=
char
[
'c'
]
content
+=
char
[
'c'
]
span
[
'content'
]
=
__replace_STX_ETX
(
content
)
span
[
'content'
]
=
__replace_STX_ETX
(
content
)
del
span
[
'chars'
]
del
span
[
'chars'
]
...
@@ -128,8 +125,13 @@ def fill_char_in_spans(spans, all_chars):
...
@@ -128,8 +125,13 @@ def fill_char_in_spans(spans, all_chars):
span
[
'chars'
].
append
(
char
)
span
[
'chars'
].
append
(
char
)
break
break
empty_spans
=
[]
for
span
in
spans
:
for
span
in
spans
:
chars_to_content
(
span
)
chars_to_content
(
span
)
if
len
(
span
[
'content'
])
==
0
:
empty_spans
.
append
(
span
)
return
empty_spans
# 使用鲁棒性更强的中心点坐标判断
# 使用鲁棒性更强的中心点坐标判断
...
@@ -162,48 +164,37 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
...
@@ -162,48 +164,37 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
def
txt_spans_extract_v2
(
pdf_page
,
spans
,
all_bboxes
,
all_discarded_blocks
,
lang
):
def
txt_spans_extract_v2
(
pdf_page
,
spans
,
all_bboxes
,
all_discarded_blocks
,
lang
):
text_blocks
=
pdf_page
.
get_text
(
'rawdict'
,
flags
=
fitz
.
TEXTFLAGS_TEXT
)[
'blocks'
]
# @todo: 拿到char之后把倾斜角度较大的先删一遍
all_pymu_chars
=
[]
for
block
in
text_blocks
:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
all_pymu_chars
.
extend
(
span
[
'chars'
])
useful_spans
=
[]
useful_spans
=
[]
unuseful_spans
=
[]
unuseful_spans
=
[]
for
span
in
spans
:
for
span
in
spans
:
for
block
in
all_bboxes
:
for
block
in
all_bboxes
+
all_discarded_blocks
:
if
block
[
7
]
in
[
BlockType
.
ImageBody
,
BlockType
.
TableBody
,
BlockType
.
InterlineEquation
]:
if
block
[
7
]
in
[
BlockType
.
ImageBody
,
BlockType
.
TableBody
,
BlockType
.
InterlineEquation
]:
continue
continue
else
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
block
[
0
:
4
])
>
0.5
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
block
[
0
:
4
])
>
0.5
:
if
block
in
all_bboxes
:
useful_spans
.
append
(
span
)
useful_spans
.
append
(
span
)
break
else
:
for
block
in
all_discarded_blocks
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
block
[
0
:
4
])
>
0.5
:
unuseful_spans
.
append
(
span
)
unuseful_spans
.
append
(
span
)
break
break
text_blocks
=
pdf_page
.
get_text
(
'rawdict'
,
flags
=
fitz
.
TEXTFLAGS_TEXT
)[
'blocks'
]
# @todo: 拿到char之后把倾斜角度较大的先删一遍
all_pymu_chars
=
[]
for
block
in
text_blocks
:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
all_pymu_chars
.
extend
(
span
[
'chars'
])
new_spans
=
[]
new_spans
=
[]
for
span
in
useful_spans
:
for
span
in
useful_spans
+
unuseful_spans
:
if
span
[
'type'
]
in
[
ContentType
.
Text
]:
span
[
'chars'
]
=
[]
new_spans
.
append
(
span
)
for
span
in
unuseful_spans
:
if
span
[
'type'
]
in
[
ContentType
.
Text
]:
if
span
[
'type'
]
in
[
ContentType
.
Text
]:
span
[
'chars'
]
=
[]
span
[
'chars'
]
=
[]
new_spans
.
append
(
span
)
new_spans
.
append
(
span
)
fill_char_in_spans
(
new_spans
,
all_pymu_chars
)
empty_spans
=
fill_char_in_spans
(
new_spans
,
all_pymu_chars
)
empty_spans
=
[]
for
span
in
new_spans
:
if
len
(
span
[
'content'
])
==
0
:
empty_spans
.
append
(
span
)
if
len
(
empty_spans
)
>
0
:
if
len
(
empty_spans
)
>
0
:
# 初始化ocr模型
# 初始化ocr模型
...
@@ -216,18 +207,16 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
...
@@ -216,18 +207,16 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
)
)
for
span
in
empty_spans
:
for
span
in
empty_spans
:
spans
.
remove
(
span
)
# 对span的bbox截图再ocr
# 对span的bbox截图
span_img
=
cut_image_to_pil_image
(
span
[
'bbox'
],
pdf_page
,
mode
=
"cv2"
)
span_img
=
cut_image_to_pil_image
(
span
[
'bbox'
],
pdf_page
,
mode
=
"cv2"
)
ocr_res
=
ocr_model
.
ocr
(
span_img
,
det
=
False
)
ocr_res
=
ocr_model
.
ocr
(
span_img
,
det
=
False
)
# logger.info(f"ocr_res: {ocr_res}")
# logger.info(f"empty_span: {span}")
if
ocr_res
and
len
(
ocr_res
)
>
0
:
if
ocr_res
and
len
(
ocr_res
)
>
0
:
if
len
(
ocr_res
[
0
])
>
0
:
if
len
(
ocr_res
[
0
])
>
0
:
ocr_text
,
ocr_score
=
ocr_res
[
0
][
0
]
ocr_text
,
ocr_score
=
ocr_res
[
0
][
0
]
if
ocr_score
>
0.5
and
len
(
ocr_text
)
>
0
:
if
ocr_score
>
0.5
and
len
(
ocr_text
)
>
0
:
span
[
'content'
]
=
ocr_text
span
[
'content'
]
=
ocr_text
spans
.
append
(
span
)
else
:
spans
.
remove
(
span
)
return
spans
return
spans
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment