Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
59435d88
Commit
59435d88
authored
Mar 13, 2025
by
myhloli
Browse files
Merge remote-tracking branch 'origin/dev' into dev
parents
c545a94e
6116488d
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
55 additions
and
18 deletions
+55
-18
magic_pdf/libs/version.py
magic_pdf/libs/version.py
+1
-1
magic_pdf/model/doc_analyze_by_custom_model.py
magic_pdf/model/doc_analyze_by_custom_model.py
+4
-2
magic_pdf/post_proc/para_split_v3.py
magic_pdf/post_proc/para_split_v3.py
+16
-13
magic_pdf/pre_proc/ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+9
-1
requirements.txt
requirements.txt
+1
-1
signatures/version1/cla.json
signatures/version1/cla.json
+24
-0
No files found.
magic_pdf/libs/version.py
View file @
59435d88
__version__
=
"1.
1.0
"
__version__
=
"1.
2.2
"
magic_pdf/model/doc_analyze_by_custom_model.py
View file @
59435d88
...
@@ -165,12 +165,14 @@ def doc_analyze(
...
@@ -165,12 +165,14 @@ def doc_analyze(
import
torch_npu
import
torch_npu
if
torch_npu
.
npu
.
is_available
():
if
torch_npu
.
npu
.
is_available
():
npu_support
=
True
npu_support
=
True
torch
.
npu
.
set_compile_mode
(
jit_compile
=
False
)
if
torch
.
cuda
.
is_available
()
and
device
!=
'cpu'
or
npu_support
:
if
torch
.
cuda
.
is_available
()
and
device
!=
'cpu'
or
npu_support
:
gpu_memory
=
int
(
os
.
getenv
(
"VIRTUAL_VRAM_SIZE"
,
round
(
get_vram
(
device
))))
gpu_memory
=
int
(
os
.
getenv
(
"VIRTUAL_VRAM_SIZE"
,
round
(
get_vram
(
device
))))
if
gpu_memory
is
not
None
and
gpu_memory
>=
8
:
if
gpu_memory
is
not
None
and
gpu_memory
>=
8
:
if
gpu_memory
>=
20
:
if
gpu_memory
>=
16
:
batch_ratio
=
16
elif
gpu_memory
>=
15
:
batch_ratio
=
8
batch_ratio
=
8
elif
gpu_memory
>=
10
:
elif
gpu_memory
>=
10
:
batch_ratio
=
4
batch_ratio
=
4
...
...
magic_pdf/post_proc/para_split_v3.py
View file @
59435d88
...
@@ -108,29 +108,32 @@ def __is_list_or_index_block(block):
...
@@ -108,29 +108,32 @@ def __is_list_or_index_block(block):
):
):
multiple_para_flag
=
True
multiple_para_flag
=
True
for
line
in
block
[
'lines'
]:
block_text
=
''
line_mid_x
=
(
line
[
'bbox'
][
0
]
+
line
[
'bbox'
][
2
])
/
2
block_mid_x
=
(
block
[
'bbox_fs'
][
0
]
+
block
[
'bbox_fs'
][
2
])
/
2
if
(
line
[
'bbox'
][
0
]
-
block
[
'bbox_fs'
][
0
]
>
0.7
*
line_height
and
block
[
'bbox_fs'
][
2
]
-
line
[
'bbox'
][
2
]
>
0.7
*
line_height
):
external_sides_not_close_num
+=
1
if
abs
(
line_mid_x
-
block_mid_x
)
<
line_height
/
2
:
center_close_num
+=
1
for
line
in
block
[
'lines'
]:
line_text
=
''
line_text
=
''
for
span
in
line
[
'spans'
]:
for
span
in
line
[
'spans'
]:
span_type
=
span
[
'type'
]
span_type
=
span
[
'type'
]
if
span_type
==
ContentType
.
Text
:
if
span_type
==
ContentType
.
Text
:
line_text
+=
span
[
'content'
].
strip
()
line_text
+=
span
[
'content'
].
strip
()
# 添加所有文本,包括空行,保持与block['lines']长度一致
# 添加所有文本,包括空行,保持与block['lines']长度一致
lines_text_list
.
append
(
line_text
)
lines_text_list
.
append
(
line_text
)
block_text
=
''
.
join
(
lines_text_list
)
block_text
=
''
.
join
(
lines_text_list
)
block_lang
=
detect_lang
(
block_text
)
# logger.info(f"block_lang: {block_lang}")
block_lang
=
detect_lang
(
block_text
)
# logger.info(f"block_lang: {block_lang}")
for
line
in
block
[
'lines'
]:
line_mid_x
=
(
line
[
'bbox'
][
0
]
+
line
[
'bbox'
][
2
])
/
2
block_mid_x
=
(
block
[
'bbox_fs'
][
0
]
+
block
[
'bbox_fs'
][
2
])
/
2
if
(
line
[
'bbox'
][
0
]
-
block
[
'bbox_fs'
][
0
]
>
0.7
*
line_height
and
block
[
'bbox_fs'
][
2
]
-
line
[
'bbox'
][
2
]
>
0.7
*
line_height
):
external_sides_not_close_num
+=
1
if
abs
(
line_mid_x
-
block_mid_x
)
<
line_height
/
2
:
center_close_num
+=
1
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
if
abs
(
block
[
'bbox_fs'
][
0
]
-
line
[
'bbox'
][
0
])
<
line_height
/
2
:
if
abs
(
block
[
'bbox_fs'
][
0
]
-
line
[
'bbox'
][
0
])
<
line_height
/
2
:
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
59435d88
...
@@ -62,7 +62,15 @@ def merge_spans_to_line(spans, threshold=0.6):
...
@@ -62,7 +62,15 @@ def merge_spans_to_line(spans, threshold=0.6):
def
span_block_type_compatible
(
span_type
,
block_type
):
def
span_block_type_compatible
(
span_type
,
block_type
):
if
span_type
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
]:
if
span_type
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
]:
return
block_type
in
[
BlockType
.
Text
,
BlockType
.
Title
,
BlockType
.
ImageCaption
,
BlockType
.
ImageFootnote
,
BlockType
.
TableCaption
,
BlockType
.
TableFootnote
]
return
block_type
in
[
BlockType
.
Text
,
BlockType
.
Title
,
BlockType
.
ImageCaption
,
BlockType
.
ImageFootnote
,
BlockType
.
TableCaption
,
BlockType
.
TableFootnote
,
BlockType
.
Discarded
]
elif
span_type
==
ContentType
.
InterlineEquation
:
elif
span_type
==
ContentType
.
InterlineEquation
:
return
block_type
in
[
BlockType
.
InterlineEquation
,
BlockType
.
Text
]
return
block_type
in
[
BlockType
.
InterlineEquation
,
BlockType
.
Text
]
elif
span_type
==
ContentType
.
Image
:
elif
span_type
==
ContentType
.
Image
:
...
...
requirements.txt
View file @
59435d88
boto3
>=1.28.43
boto3
>=1.28.43
Brotli
>=1.1.0
Brotli
>=1.1.0
click
>=8.1.7
click
>=8.1.7
fast-langdetect
>=0.2.3
fast-langdetect
>=0.2.3
,<0.3.0
loguru
>=0.6.0
loguru
>=0.6.0
numpy
>=1.21.6,<2.0.0
numpy
>=1.21.6,<2.0.0
pydantic
>=2.7.2
pydantic
>=2.7.2
...
...
signatures/version1/cla.json
View file @
59435d88
...
@@ -159,6 +159,30 @@
...
@@ -159,6 +159,30 @@
"created_at"
:
"2025-02-22T07:15:35Z"
,
"created_at"
:
"2025-02-22T07:15:35Z"
,
"repoId"
:
765083837
,
"repoId"
:
765083837
,
"pullRequestNo"
:
1743
"pullRequestNo"
:
1743
},
{
"name"
:
"nadahlberg"
,
"id"
:
58701810
,
"comment_id"
:
2676309097
,
"created_at"
:
"2025-02-22T17:04:14Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
1748
},
{
"name"
:
"BetterAndBetterII"
,
"id"
:
141388234
,
"comment_id"
:
2680567709
,
"created_at"
:
"2025-02-25T05:21:05Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
1767
},
{
"name"
:
"luckymore"
,
"id"
:
5390013
,
"comment_id"
:
2684392503
,
"created_at"
:
"2025-02-26T09:23:25Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
1785
}
}
]
]
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment