Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
2f0e5b2a
Unverified
Commit
2f0e5b2a
authored
Nov 27, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 27, 2024
Browse files
Merge pull request #1113 from myhloli/dev
refactor(ocr): remove unused functions and optimize OCR processing loop
parents
a65d6b53
5f4410b4
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
8 additions
and
48 deletions
+8
-48
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
+0
-28
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py
+3
-3
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+5
-5
magic_pdf/pre_proc/ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+0
-12
No files found.
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
View file @
2f0e5b2a
import
math
import
numpy
as
np
import
numpy
as
np
from
loguru
import
logger
from
loguru
import
logger
...
@@ -252,32 +250,6 @@ def get_ocr_result_list(ocr_res, useful_list):
...
@@ -252,32 +250,6 @@ def get_ocr_result_list(ocr_res, useful_list):
return
ocr_result_list
return
ocr_result_list
def
calculate_angle_degrees
(
poly
):
# 定义对角线的顶点
diagonal1
=
(
poly
[
0
],
poly
[
2
])
diagonal2
=
(
poly
[
1
],
poly
[
3
])
# 计算对角线的斜率
def
slope
(
p1
,
p2
):
return
(
p2
[
1
]
-
p1
[
1
])
/
(
p2
[
0
]
-
p1
[
0
])
if
p2
[
0
]
!=
p1
[
0
]
else
float
(
'inf'
)
slope1
=
slope
(
diagonal1
[
0
],
diagonal1
[
1
])
slope2
=
slope
(
diagonal2
[
0
],
diagonal2
[
1
])
# 计算对角线与x轴的夹角(以弧度为单位)
angle1_radians
=
math
.
atan
(
slope1
)
angle2_radians
=
math
.
atan
(
slope2
)
# 将弧度转换为角度
angle1_degrees
=
math
.
degrees
(
angle1_radians
)
angle2_degrees
=
math
.
degrees
(
angle2_radians
)
# 取两条对角线与x轴夹角的平均值
average_angle_degrees
=
abs
((
angle1_degrees
+
angle2_degrees
)
/
2
)
# logger.info(f"average_angle_degrees: {average_angle_degrees}")
return
average_angle_degrees
def
calculate_is_angle
(
poly
):
def
calculate_is_angle
(
poly
):
p1
,
p2
,
p3
,
p4
=
poly
p1
,
p2
,
p3
,
p4
=
poly
height
=
((
p4
[
1
]
-
p1
[
1
])
+
(
p3
[
1
]
-
p2
[
1
]))
/
2
height
=
((
p4
[
1
]
-
p1
[
1
])
+
(
p3
[
1
]
-
p2
[
1
]))
/
2
...
...
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py
View file @
2f0e5b2a
...
@@ -63,7 +63,7 @@ class ModifiedPaddleOCR(PaddleOCR):
...
@@ -63,7 +63,7 @@ class ModifiedPaddleOCR(PaddleOCR):
if
det
and
rec
:
if
det
and
rec
:
ocr_res
=
[]
ocr_res
=
[]
for
idx
,
img
in
enumerate
(
imgs
)
:
for
img
in
imgs
:
img
=
preprocess_image
(
img
)
img
=
preprocess_image
(
img
)
dt_boxes
,
rec_res
,
_
=
self
.
__call__
(
img
,
cls
,
mfd_res
=
mfd_res
)
dt_boxes
,
rec_res
,
_
=
self
.
__call__
(
img
,
cls
,
mfd_res
=
mfd_res
)
if
not
dt_boxes
and
not
rec_res
:
if
not
dt_boxes
and
not
rec_res
:
...
@@ -75,7 +75,7 @@ class ModifiedPaddleOCR(PaddleOCR):
...
@@ -75,7 +75,7 @@ class ModifiedPaddleOCR(PaddleOCR):
return
ocr_res
return
ocr_res
elif
det
and
not
rec
:
elif
det
and
not
rec
:
ocr_res
=
[]
ocr_res
=
[]
for
idx
,
img
in
enumerate
(
imgs
)
:
for
img
in
imgs
:
img
=
preprocess_image
(
img
)
img
=
preprocess_image
(
img
)
dt_boxes
,
elapse
=
self
.
text_detector
(
img
)
dt_boxes
,
elapse
=
self
.
text_detector
(
img
)
if
dt_boxes
is
None
:
if
dt_boxes
is
None
:
...
@@ -96,7 +96,7 @@ class ModifiedPaddleOCR(PaddleOCR):
...
@@ -96,7 +96,7 @@ class ModifiedPaddleOCR(PaddleOCR):
else
:
else
:
ocr_res
=
[]
ocr_res
=
[]
cls_res
=
[]
cls_res
=
[]
for
idx
,
img
in
enumerate
(
imgs
)
:
for
img
in
imgs
:
if
not
isinstance
(
img
,
list
):
if
not
isinstance
(
img
,
list
):
img
=
preprocess_image
(
img
)
img
=
preprocess_image
(
img
)
img
=
[
img
]
img
=
[
img
]
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
2f0e5b2a
from
magic_pdf.config.ocr_content_type
import
BlockType
from
magic_pdf.config.ocr_content_type
import
BlockType
from
magic_pdf.libs.boxbase
import
(
from
magic_pdf.libs.boxbase
import
(
calculate_iou
,
calculate_overlap_area_in_bbox1_area_ratio
,
calculate_iou
,
calculate_overlap_area_in_bbox1_area_ratio
,
calculate_vertical_projection_overlap_ratio
,
calculate_vertical_projection_overlap_ratio
,
get_minbox_if_overlap_by_ratio
)
get_minbox_if_overlap_by_ratio
from
magic_pdf.pre_proc.remove_bbox_overlap
import
\
)
remove_overlap_between_bbox_for_block
from
magic_pdf.pre_proc.remove_bbox_overlap
import
remove_overlap_between_bbox_for_block
def
add_bboxes
(
blocks
,
block_type
,
bboxes
):
def
add_bboxes
(
blocks
,
block_type
,
bboxes
):
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
2f0e5b2a
from
magic_pdf.config.ocr_content_type
import
BlockType
,
ContentType
from
magic_pdf.config.ocr_content_type
import
BlockType
,
ContentType
from
magic_pdf.libs.boxbase
import
__is_overlaps_y_exceeds_threshold
,
calculate_overlap_area_in_bbox1_area_ratio
from
magic_pdf.libs.boxbase
import
__is_overlaps_y_exceeds_threshold
,
calculate_overlap_area_in_bbox1_area_ratio
...
@@ -82,14 +81,6 @@ def fill_spans_in_blocks(blocks, spans, radio):
...
@@ -82,14 +81,6 @@ def fill_spans_in_blocks(blocks, spans, radio):
if
calculate_overlap_area_in_bbox1_area_ratio
(
if
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
radio
:
span_bbox
,
block_bbox
)
>
radio
:
block_spans
.
append
(
span
)
block_spans
.
append
(
span
)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
# displayed_list = []
# text_inline_lines = []
# modify_y_axis(block_spans, displayed_list, text_inline_lines)
'''模型识别错误的行间公式, type类型转换成行内公式'''
# block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
'''bbox去除粘连'''
# 去粘连会影响span的bbox,导致后续fill的时候出错
# block_spans = remove_overlap_between_bbox_for_span(block_spans)
block_dict
[
'spans'
]
=
block_spans
block_dict
[
'spans'
]
=
block_spans
block_with_spans
.
append
(
block_dict
)
block_with_spans
.
append
(
block_dict
)
...
@@ -103,9 +94,6 @@ def fill_spans_in_blocks(blocks, spans, radio):
...
@@ -103,9 +94,6 @@ def fill_spans_in_blocks(blocks, spans, radio):
def
fix_block_spans_v2
(
block_with_spans
):
def
fix_block_spans_v2
(
block_with_spans
):
"""1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
需要将caption和footnote的text_span放入相应img_block和table_block内的
caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
fix_blocks
=
[]
fix_blocks
=
[]
for
block
in
block_with_spans
:
for
block
in
block_with_spans
:
block_type
=
block
[
'type'
]
block_type
=
block
[
'type'
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment