Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
3d7c99b6
Unverified
Commit
3d7c99b6
authored
Jun 09, 2025
by
Adrian.Wang
Committed by
GitHub
Jun 09, 2025
Browse files
Merge branch 'opendatalab:dev' into dev
parents
ee79dd65
383fed52
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
30 additions
and
8 deletions
+30
-8
magic_pdf/model/batch_analyze.py
magic_pdf/model/batch_analyze.py
+30
-8
No files found.
magic_pdf/model/batch_analyze.py
View file @
3d7c99b6
...
@@ -181,17 +181,16 @@ class BatchAnalyze:
...
@@ -181,17 +181,16 @@ class BatchAnalyze:
# 对每个分辨率组进行批处理
# 对每个分辨率组进行批处理
for
group_key
,
group_crops
in
tqdm
(
resolution_groups
.
items
(),
desc
=
f
"OCR-det
{
lang
}
"
):
for
group_key
,
group_crops
in
tqdm
(
resolution_groups
.
items
(),
desc
=
f
"OCR-det
{
lang
}
"
):
raw_images
=
[
crop_info
[
0
]
for
crop_info
in
group_crops
]
# 计算目标尺寸(组内最大尺寸,向上取整到32的倍数)
# 计算目标尺寸(组内最大尺寸,向上取整到32的倍数)
max_h
=
max
(
img
.
shape
[
0
]
for
img
in
raw_image
s
)
max_h
=
max
(
crop_info
[
0
]
.
shape
[
0
]
for
crop_info
in
group_crop
s
)
max_w
=
max
(
img
.
shape
[
1
]
for
img
in
raw_image
s
)
max_w
=
max
(
crop_info
[
0
]
.
shape
[
1
]
for
crop_info
in
group_crop
s
)
target_h
=
((
max_h
+
32
-
1
)
//
32
)
*
32
target_h
=
((
max_h
+
32
-
1
)
//
32
)
*
32
target_w
=
((
max_w
+
32
-
1
)
//
32
)
*
32
target_w
=
((
max_w
+
32
-
1
)
//
32
)
*
32
# 对所有图像进行padding到统一尺寸
# 对所有图像进行padding到统一尺寸
batch_images
=
[]
batch_images
=
[]
for
img
in
raw_images
:
for
crop_info
in
group_crops
:
img
=
crop_info
[
0
]
h
,
w
=
img
.
shape
[:
2
]
h
,
w
=
img
.
shape
[:
2
]
# 创建目标尺寸的白色背景
# 创建目标尺寸的白色背景
padded_img
=
np
.
ones
((
target_h
,
target_w
,
3
),
dtype
=
np
.
uint8
)
*
255
padded_img
=
np
.
ones
((
target_h
,
target_w
,
3
),
dtype
=
np
.
uint8
)
*
255
...
@@ -208,9 +207,32 @@ class BatchAnalyze:
...
@@ -208,9 +207,32 @@ class BatchAnalyze:
for
i
,
(
crop_info
,
(
dt_boxes
,
elapse
))
in
enumerate
(
zip
(
group_crops
,
batch_results
)):
for
i
,
(
crop_info
,
(
dt_boxes
,
elapse
))
in
enumerate
(
zip
(
group_crops
,
batch_results
)):
new_image
,
useful_list
,
ocr_res_list_dict
,
res
,
adjusted_mfdetrec_res
,
_lang
=
crop_info
new_image
,
useful_list
,
ocr_res_list_dict
,
res
,
adjusted_mfdetrec_res
,
_lang
=
crop_info
if
dt_boxes
is
not
None
:
if
dt_boxes
is
not
None
and
len
(
dt_boxes
)
>
0
:
# 构造OCR结果格式 - 每个box应该是4个点的列表
# 直接应用原始OCR流程中的关键处理步骤
ocr_res
=
[
box
.
tolist
()
for
box
in
dt_boxes
]
from
magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.ocr_utils
import
(
merge_det_boxes
,
update_det_boxes
,
sorted_boxes
)
# 1. 排序检测框
if
len
(
dt_boxes
)
>
0
:
dt_boxes_sorted
=
sorted_boxes
(
dt_boxes
)
else
:
dt_boxes_sorted
=
[]
# 2. 合并相邻检测框
if
dt_boxes_sorted
:
dt_boxes_merged
=
merge_det_boxes
(
dt_boxes_sorted
)
else
:
dt_boxes_merged
=
[]
# 3. 根据公式位置更新检测框(关键步骤!)
if
dt_boxes_merged
and
adjusted_mfdetrec_res
:
dt_boxes_final
=
update_det_boxes
(
dt_boxes_merged
,
adjusted_mfdetrec_res
)
else
:
dt_boxes_final
=
dt_boxes_merged
# 构造OCR结果格式
ocr_res
=
[
box
.
tolist
()
if
hasattr
(
box
,
'tolist'
)
else
box
for
box
in
dt_boxes_final
]
if
ocr_res
:
if
ocr_res
:
ocr_result_list
=
get_ocr_result_list
(
ocr_result_list
=
get_ocr_result_list
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment