Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
d9b5d004
Commit d9b5d004 authored Jul 05, 2025 by myhloli
Browse files
refactor: update content type references in pipeline and VLM processing scripts
parent
e6f817fe
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing 3 changed files with 22 additions and 24 deletions
+22
-24
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py
+11
-14
mineru/backend/vlm/vlm_middle_json_mkcontent.py
mineru/backend/vlm/vlm_middle_json_mkcontent.py
+10
-10
mineru/utils/enum_class.py
mineru/utils/enum_class.py
+1
-0
No files found.
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py
View file @
d9b5d004
...
...
@@ -193,12 +193,12 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
para_content
=
{}
if
para_type
in
[
BlockType
.
TEXT
,
BlockType
.
LIST
,
BlockType
.
INDEX
]:
para_content
=
{
'type'
:
'text'
,
'type'
:
ContentType
.
TEXT
,
'text'
:
merge_para_with_text
(
para_block
),
}
elif
para_type
==
BlockType
.
TITLE
:
para_content
=
{
'type'
:
'text'
,
'type'
:
ContentType
.
TEXT
,
'text'
:
merge_para_with_text
(
para_block
),
}
title_level
=
get_title_level
(
para_block
)
...
...
@@ -208,14 +208,14 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
if
len
(
para_block
[
'lines'
])
==
0
or
len
(
para_block
[
'lines'
][
0
][
'spans'
])
==
0
:
return
None
para_content
=
{
'type'
:
'equation'
,
'type'
:
ContentType
.
EQUATION
,
'img_path'
:
f
"
{
img_buket_path
}
/
{
para_block
[
'lines'
][
0
][
'spans'
][
0
].
get
(
'image_path'
,
''
)
}
"
,
}
if
para_block
[
'lines'
][
0
][
'spans'
][
0
].
get
(
'content'
,
''
):
para_content
[
'text'
]
=
merge_para_with_text
(
para_block
)
para_content
[
'text_format'
]
=
'latex'
elif
para_type
==
BlockType
.
IMAGE
:
para_content
=
{
'type'
:
'image'
,
'img_path'
:
''
,
'img_caption'
:
[],
'img_footnote'
:
[]}
para_content
=
{
'type'
:
ContentType
.
IMAGE
,
'img_path'
:
''
,
BlockType
.
IMAGE_CAPTION
:
[],
BlockType
.
IMAGE_FOOTNOTE
:
[]}
for
block
in
para_block
[
'blocks'
]:
if
block
[
'type'
]
==
BlockType
.
IMAGE_BODY
:
for
line
in
block
[
'lines'
]:
...
...
@@ -224,29 +224,26 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
if
span
.
get
(
'image_path'
,
''
):
para_content
[
'img_path'
]
=
f
"
{
img_buket_path
}
/
{
span
[
'image_path'
]
}
"
if
block
[
'type'
]
==
BlockType
.
IMAGE_CAPTION
:
para_content
[
'img_caption'
].
append
(
merge_para_with_text
(
block
))
para_content
[
BlockType
.
IMAGE_CAPTION
].
append
(
merge_para_with_text
(
block
))
if
block
[
'type'
]
==
BlockType
.
IMAGE_FOOTNOTE
:
para_content
[
'img_footnote'
].
append
(
merge_para_with_text
(
block
))
para_content
[
BlockType
.
IMAGE_FOOTNOTE
].
append
(
merge_para_with_text
(
block
))
elif
para_type
==
BlockType
.
TABLE
:
para_content
=
{
'type'
:
'table'
,
'img_path'
:
''
,
'table_caption'
:
[],
'table_footnote'
:
[]}
para_content
=
{
'type'
:
ContentType
.
TABLE
,
'img_path'
:
''
,
BlockType
.
TABLE_CAPTION
:
[],
BlockType
.
TABLE_FOOTNOTE
:
[]}
for
block
in
para_block
[
'blocks'
]:
if
block
[
'type'
]
==
BlockType
.
TABLE_BODY
:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
if
span
[
'type'
]
==
ContentType
.
TABLE
:
if
span
.
get
(
'latex'
,
''
):
para_content
[
'table_body'
]
=
f
"
{
span
[
'latex'
]
}
"
elif
span
.
get
(
'html'
,
''
):
para_content
[
'table_body'
]
=
f
"
{
span
[
'html'
]
}
"
if
span
.
get
(
'html'
,
''
):
para_content
[
BlockType
.
TABLE_BODY
]
=
f
"
{
span
[
'html'
]
}
"
if
span
.
get
(
'image_path'
,
''
):
para_content
[
'img_path'
]
=
f
"
{
img_buket_path
}
/
{
span
[
'image_path'
]
}
"
if
block
[
'type'
]
==
BlockType
.
TABLE_CAPTION
:
para_content
[
'table_caption'
].
append
(
merge_para_with_text
(
block
))
para_content
[
BlockType
.
TABLE_CAPTION
].
append
(
merge_para_with_text
(
block
))
if
block
[
'type'
]
==
BlockType
.
TABLE_FOOTNOTE
:
para_content
[
'table_footnote'
].
append
(
merge_para_with_text
(
block
))
para_content
[
BlockType
.
TABLE_FOOTNOTE
].
append
(
merge_para_with_text
(
block
))
para_content
[
'page_idx'
]
=
page_idx
...
...
mineru/backend/vlm/vlm_middle_json_mkcontent.py
View file @
d9b5d004
...
...
@@ -130,25 +130,25 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
para_content
=
{}
if
para_type
in
[
BlockType
.
TEXT
,
BlockType
.
LIST
,
BlockType
.
INDEX
]:
para_content
=
{
'type'
:
'text'
,
'type'
:
ContentType
.
TEXT
,
'text'
:
merge_para_with_text
(
para_block
),
}
elif
para_type
==
BlockType
.
TITLE
:
title_level
=
get_title_level
(
para_block
)
para_content
=
{
'type'
:
'text'
,
'type'
:
ContentType
.
TEXT
,
'text'
:
merge_para_with_text
(
para_block
),
}
if
title_level
!=
0
:
para_content
[
'text_level'
]
=
title_level
elif
para_type
==
BlockType
.
INTERLINE_EQUATION
:
para_content
=
{
'type'
:
'equation'
,
'type'
:
ContentType
.
EQUATION
,
'text'
:
merge_para_with_text
(
para_block
),
'text_format'
:
'latex'
,
}
elif
para_type
==
BlockType
.
IMAGE
:
para_content
=
{
'type'
:
'image'
,
'img_path'
:
''
,
'img_caption'
:
[],
'img_footnote'
:
[]}
para_content
=
{
'type'
:
ContentType
.
IMAGE
,
'img_path'
:
''
,
BlockType
.
IMAGE_CAPTION
:
[],
BlockType
.
IMAGE_FOOTNOTE
:
[]}
for
block
in
para_block
[
'blocks'
]:
if
block
[
'type'
]
==
BlockType
.
IMAGE_BODY
:
for
line
in
block
[
'lines'
]:
...
...
@@ -157,11 +157,11 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
if
span
.
get
(
'image_path'
,
''
):
para_content
[
'img_path'
]
=
f
"
{
img_buket_path
}
/
{
span
[
'image_path'
]
}
"
if
block
[
'type'
]
==
BlockType
.
IMAGE_CAPTION
:
para_content
[
'img_caption'
].
append
(
merge_para_with_text
(
block
))
para_content
[
BlockType
.
IMAGE_CAPTION
].
append
(
merge_para_with_text
(
block
))
if
block
[
'type'
]
==
BlockType
.
IMAGE_FOOTNOTE
:
para_content
[
'img_footnote'
].
append
(
merge_para_with_text
(
block
))
para_content
[
BlockType
.
IMAGE_FOOTNOTE
].
append
(
merge_para_with_text
(
block
))
elif
para_type
==
BlockType
.
TABLE
:
para_content
=
{
'type'
:
'table'
,
'img_path'
:
''
,
'table_caption'
:
[],
'table_footnote'
:
[]}
para_content
=
{
'type'
:
ContentType
.
TABLE
,
'img_path'
:
''
,
BlockType
.
TABLE_CAPTION
:
[],
BlockType
.
TABLE_FOOTNOTE
:
[]}
for
block
in
para_block
[
'blocks'
]:
if
block
[
'type'
]
==
BlockType
.
TABLE_BODY
:
for
line
in
block
[
'lines'
]:
...
...
@@ -169,15 +169,15 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
if
span
[
'type'
]
==
ContentType
.
TABLE
:
if
span
.
get
(
'html'
,
''
):
para_content
[
'table_body'
]
=
f
"
{
span
[
'html'
]
}
"
para_content
[
BlockType
.
TABLE_BODY
]
=
f
"
{
span
[
'html'
]
}
"
if
span
.
get
(
'image_path'
,
''
):
para_content
[
'img_path'
]
=
f
"
{
img_buket_path
}
/
{
span
[
'image_path'
]
}
"
if
block
[
'type'
]
==
BlockType
.
TABLE_CAPTION
:
para_content
[
'table_caption'
].
append
(
merge_para_with_text
(
block
))
para_content
[
BlockType
.
TABLE_CAPTION
].
append
(
merge_para_with_text
(
block
))
if
block
[
'type'
]
==
BlockType
.
TABLE_FOOTNOTE
:
para_content
[
'table_footnote'
].
append
(
merge_para_with_text
(
block
))
para_content
[
BlockType
.
TABLE_FOOTNOTE
].
append
(
merge_para_with_text
(
block
))
para_content
[
'page_idx'
]
=
page_idx
...
...
mineru/utils/enum_class.py
View file @
d9b5d004
...
...
@@ -21,6 +21,7 @@ class ContentType:
TEXT
=
'text'
INTERLINE_EQUATION
=
'interline_equation'
INLINE_EQUATION
=
'inline_equation'
EQUATION
=
'equation'
class
CategoryId
:
...
...
Write
Preview
Markdown
is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment