Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
3334157f
"src/array/cuda/sddmm_hetero_coo.hip" did not exist on "8ac27dad1a20b4228419e64746ae9110416e34ee"
Commit
3334157f
authored
Jun 05, 2025
by
myhloli
Browse files
refactor: clean up unused OCR area calculation and update demo PDF path
parent
236a6033
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
10 additions
and
19 deletions
+10
-19
mineru/backend/pipeline/batch_analyze.py
mineru/backend/pipeline/batch_analyze.py
+0
-13
mineru/backend/pipeline/model_json_to_middle_json.py
mineru/backend/pipeline/model_json_to_middle_json.py
+3
-0
mineru/cli/common.py
mineru/cli/common.py
+7
-6
No files found.
mineru/backend/pipeline/batch_analyze.py
View file @
3334157f
...
...
@@ -230,19 +230,6 @@ class BatchAnalyze:
ocr_result_list
=
get_ocr_result_list
(
ocr_res
,
useful_list
,
ocr_res_list_dict
[
'ocr_enable'
],
new_image
,
_lang
)
# if res["category_id"] == 3 and ocr_res_list_dict['ocr_enable']:
# # ocr_result_list中所有bbox的面积之和
# ocr_res_area = sum(
# get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
# # 求ocr_res_area和res的面积的比值
# res_area = get_coords_and_area(res)[4]
# if res_area > 0:
# ratio = ocr_res_area / res_area
# if ratio > 0.25:
# res["category_id"] = 1
# else:
# continue
ocr_res_list_dict
[
'layout_res'
].
extend
(
ocr_result_list
)
# 表格识别 table recognition
...
...
mineru/backend/pipeline/model_json_to_middle_json.py
View file @
3334157f
...
...
@@ -48,6 +48,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
"""获取所有的spans信息"""
spans
=
magic_model
.
get_all_spans
()
"""某些图可能是文本块,通过简单的规则判断一下"""
if
len
(
maybe_text_image_blocks
)
>
0
:
for
block
in
maybe_text_image_blocks
:
span_in_block_list
=
[]
...
...
@@ -64,8 +65,10 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
if
ratio
>
0.25
and
ocr
:
# 移除block的group_id
block
.
pop
(
'group_id'
,
None
)
# 符合文本图的条件就把块加入到文本块列表中
text_blocks
.
append
(
block
)
else
:
# 如果不符合文本图的条件,就把块加回到图片块列表中
img_body_blocks
.
append
(
block
)
else
:
img_body_blocks
.
append
(
block
)
...
...
mineru/cli/common.py
View file @
3334157f
...
...
@@ -215,9 +215,10 @@ def do_parse(
if
__name__
==
"__main__"
:
pdf_path
=
"../../demo/pdfs/demo2.pdf"
with
open
(
pdf_path
,
"rb"
)
as
f
:
pdf_path
=
"../../demo/pdfs/hello-algo-1.1.0-zh-c-word转换的span有问题.pdf"
# pdf_path = "C:/Users/zhaoxiaomeng/Downloads/input_img_0.jpg"
try
:
do_parse
(
"./output"
,
[
Path
(
pdf_path
).
stem
],
[
f
.
read
(
)],[
"ch"
],
end_page_id
=
20
,)
do_parse
(
"./output"
,
[
Path
(
pdf_path
).
stem
],
[
read
_fn
(
Path
(
pdf_path
)
)],[
"ch"
],
end_page_id
=
20
,)
except
Exception
as
e
:
logger
.
exception
(
e
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment