Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
1383787b
"test/vscode:/vscode.git/clone" did not exist on "cbf267c357a7574d9382ffcace75089dc2798295"
Commit
1383787b
authored
Jun 15, 2025
by
myhloli
Browse files
fix: refactor formula and table enable handling to use environment variables
parent
9b279553
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
17 additions
and
10 deletions
+17
-10
mineru/backend/pipeline/batch_analyze.py
mineru/backend/pipeline/batch_analyze.py
+3
-2
mineru/backend/pipeline/model_json_to_middle_json.py
mineru/backend/pipeline/model_json_to_middle_json.py
+2
-2
mineru/backend/pipeline/pipeline_analyze.py
mineru/backend/pipeline/pipeline_analyze.py
+0
-4
mineru/cli/common.py
mineru/cli/common.py
+0
-2
mineru/utils/config_reader.py
mineru/utils/config_reader.py
+12
-0
No files found.
mineru/backend/pipeline/batch_analyze.py
View file @
1383787b
...
@@ -5,6 +5,7 @@ from collections import defaultdict
...
@@ -5,6 +5,7 @@ from collections import defaultdict
import
numpy
as
np
import
numpy
as
np
from
.model_init
import
AtomModelSingleton
from
.model_init
import
AtomModelSingleton
from
...utils.config_reader
import
get_formula_enable
,
get_table_enable
from
...utils.model_utils
import
crop_img
,
get_res_list_from_layout_res
from
...utils.model_utils
import
crop_img
,
get_res_list_from_layout_res
from
...utils.ocr_utils
import
get_adjusted_mfdetrec_res
,
get_ocr_result_list
,
OcrConfidence
from
...utils.ocr_utils
import
get_adjusted_mfdetrec_res
,
get_ocr_result_list
,
OcrConfidence
...
@@ -16,8 +17,8 @@ MFR_BASE_BATCH_SIZE = 16
...
@@ -16,8 +17,8 @@ MFR_BASE_BATCH_SIZE = 16
class
BatchAnalyze
:
class
BatchAnalyze
:
def
__init__
(
self
,
model_manager
,
batch_ratio
:
int
,
formula_enable
,
table_enable
,
enable_ocr_det_batch
:
bool
=
True
):
def
__init__
(
self
,
model_manager
,
batch_ratio
:
int
,
formula_enable
,
table_enable
,
enable_ocr_det_batch
:
bool
=
True
):
self
.
batch_ratio
=
batch_ratio
self
.
batch_ratio
=
batch_ratio
self
.
formula_enable
=
formula_enable
self
.
formula_enable
=
get_
formula_enable
(
formula_enable
)
self
.
table_enable
=
table_enable
self
.
table_enable
=
get_
table_enable
(
table_enable
)
self
.
model_manager
=
model_manager
self
.
model_manager
=
model_manager
self
.
enable_ocr_det_batch
=
enable_ocr_det_batch
self
.
enable_ocr_det_batch
=
enable_ocr_det_batch
...
...
mineru/backend/pipeline/model_json_to_middle_json.py
View file @
1383787b
...
@@ -4,7 +4,7 @@ import time
...
@@ -4,7 +4,7 @@ import time
from
loguru
import
logger
from
loguru
import
logger
from
tqdm
import
tqdm
from
tqdm
import
tqdm
from
mineru.utils.config_reader
import
get_device
,
get_llm_aided_config
from
mineru.utils.config_reader
import
get_device
,
get_llm_aided_config
,
get_formula_enable
from
mineru.backend.pipeline.model_init
import
AtomModelSingleton
from
mineru.backend.pipeline.model_init
import
AtomModelSingleton
from
mineru.backend.pipeline.para_split
import
para_split
from
mineru.backend.pipeline.para_split
import
para_split
from
mineru.utils.block_pre_proc
import
prepare_block_bboxes
,
process_groups
from
mineru.utils.block_pre_proc
import
prepare_block_bboxes
,
process_groups
...
@@ -78,7 +78,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
...
@@ -78,7 +78,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
"""将所有区块的bbox整理到一起"""
"""将所有区块的bbox整理到一起"""
if
formula_enabled
:
if
get_formula_enable
(
formula_enabled
)
:
interline_equation_blocks
=
[]
interline_equation_blocks
=
[]
if
len
(
interline_equation_blocks
)
>
0
:
if
len
(
interline_equation_blocks
)
>
0
:
...
...
mineru/backend/pipeline/pipeline_analyze.py
View file @
1383787b
...
@@ -189,10 +189,6 @@ def batch_image_analyze(
...
@@ -189,10 +189,6 @@ def batch_image_analyze(
batch_ratio
=
1
batch_ratio
=
1
logger
.
info
(
f
'Could not determine GPU memory, using default batch_ratio:
{
batch_ratio
}
'
)
logger
.
info
(
f
'Could not determine GPU memory, using default batch_ratio:
{
batch_ratio
}
'
)
if
os
.
getenv
(
'MINERU_FORMULA_ENABLE'
,
None
)
is
not
None
:
formula_enable
=
os
.
getenv
(
'MINERU_FORMULA_ENABLE'
).
lower
()
==
'true'
if
os
.
getenv
(
'MINERU_TABLE_ENABLE'
,
None
)
is
not
None
:
table_enable
=
os
.
getenv
(
'MINERU_TABLE_ENABLE'
).
lower
()
==
'true'
batch_model
=
BatchAnalyze
(
model_manager
,
batch_ratio
,
formula_enable
,
table_enable
)
batch_model
=
BatchAnalyze
(
model_manager
,
batch_ratio
,
formula_enable
,
table_enable
)
results
=
batch_model
(
images_with_extra_info
)
results
=
batch_model
(
images_with_extra_info
)
...
...
mineru/cli/common.py
View file @
1383787b
...
@@ -116,8 +116,6 @@ def do_parse(
...
@@ -116,8 +116,6 @@ def do_parse(
_lang
=
lang_list
[
idx
]
_lang
=
lang_list
[
idx
]
_ocr_enable
=
ocr_enabled_list
[
idx
]
_ocr_enable
=
ocr_enabled_list
[
idx
]
if
os
.
getenv
(
'MINERU_FORMULA_ENABLE'
,
None
)
is
not
None
:
p_formula_enable
=
os
.
getenv
(
'MINERU_FORMULA_ENABLE'
).
lower
()
==
'true'
middle_json
=
pipeline_result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
_lang
,
_ocr_enable
,
p_formula_enable
)
middle_json
=
pipeline_result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
_lang
,
_ocr_enable
,
p_formula_enable
)
pdf_info
=
middle_json
[
"pdf_info"
]
pdf_info
=
middle_json
[
"pdf_info"
]
...
...
mineru/utils/config_reader.py
View file @
1383787b
...
@@ -86,6 +86,18 @@ def get_device():
...
@@ -86,6 +86,18 @@ def get_device():
return
"cpu"
return
"cpu"
def get_formula_enable(formula_enable):
    """Resolve the effective formula-enable flag.

    The ``MINERU_FORMULA_ENABLE`` environment variable, when set, takes
    precedence over the value supplied by the caller.

    Args:
        formula_enable: Value to use when the environment variable is
            not set. It is returned unchanged in that case.

    Returns:
        ``formula_enable`` as given when ``MINERU_FORMULA_ENABLE`` is unset;
        otherwise ``True`` if the variable equals ``"true"`` (ignoring case
        and surrounding whitespace) and ``False`` for any other value.
    """
    formula_enable_env = os.getenv('MINERU_FORMULA_ENABLE')
    if formula_enable_env is None:
        return formula_enable
    # Strip before comparing so values such as "true " or "True\n" (common
    # in .env files and shell exports) are not silently read as False.
    return formula_enable_env.strip().lower() == 'true'
def get_table_enable(table_enable):
    """Resolve the effective table-enable flag.

    The ``MINERU_TABLE_ENABLE`` environment variable, when set, takes
    precedence over the value supplied by the caller.

    Args:
        table_enable: Value to use when the environment variable is not
            set. It is returned unchanged in that case.

    Returns:
        ``table_enable`` as given when ``MINERU_TABLE_ENABLE`` is unset;
        otherwise ``True`` if the variable equals ``"true"`` (ignoring case
        and surrounding whitespace) and ``False`` for any other value.
    """
    table_enable_env = os.getenv('MINERU_TABLE_ENABLE')
    if table_enable_env is None:
        return table_enable
    # Strip before comparing so values such as "true " or "True\n" (common
    # in .env files and shell exports) are not silently read as False.
    return table_enable_env.strip().lower() == 'true'
def
get_latex_delimiter_config
():
def
get_latex_delimiter_config
():
config
=
read_config
()
config
=
read_config
()
if
config
is
None
:
if
config
is
None
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment