Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
8998380d
"docs/vscode:/vscode.git/clone" did not exist on "65ecbb94921aa961a9be643100262c9abafc1830"
Commit
8998380d
authored
Jun 20, 2024
by
赵小蒙
Browse files
update check invalid_chars algorithm to improve accuracy
parent
35a700da
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
11 additions
and
8 deletions
+11
-8
magic_pdf/libs/pdf_check.py
magic_pdf/libs/pdf_check.py
+11
-8
No files found.
magic_pdf/libs/pdf_check.py
View file @
8998380d
...
@@ -6,15 +6,11 @@ from loguru import logger
...
@@ -6,15 +6,11 @@ from loguru import logger
from
pdfminer.high_level
import
extract_text
from
pdfminer.high_level
import
extract_text
def
calculate_sample_count
(
total_page
:
int
,
sample_ratio
=
0.1
):
def
calculate_sample_count
(
total_page
:
int
):
"""
"""
根据总页数和采样率计算采样页面的数量。
根据总页数和采样率计算采样页面的数量。
"""
"""
select_page_cnt
=
int
(
total_page
*
sample_ratio
)
select_page_cnt
=
min
(
10
,
total_page
)
if
select_page_cnt
<
5
:
select_page_cnt
=
min
(
10
,
total_page
)
elif
select_page_cnt
>
10
:
select_page_cnt
=
10
return
select_page_cnt
return
select_page_cnt
...
@@ -46,14 +42,21 @@ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
...
@@ -46,14 +42,21 @@ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
sample_pdf_bytes
=
sample_docs
.
tobytes
()
sample_pdf_bytes
=
sample_docs
.
tobytes
()
sample_pdf_file_like_object
=
BytesIO
(
sample_pdf_bytes
)
sample_pdf_file_like_object
=
BytesIO
(
sample_pdf_bytes
)
text
=
extract_text
(
sample_pdf_file_like_object
)
text
=
extract_text
(
sample_pdf_file_like_object
)
text
=
text
.
replace
(
"
\n
"
,
""
)
# logger.info(text)
# logger.info(text)
'''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
'''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
cid_pattern
=
re
.
compile
(
r
'\(cid:\d+\)'
)
cid_pattern
=
re
.
compile
(
r
'\(cid:\d+\)'
)
matches
=
cid_pattern
.
findall
(
text
)
matches
=
cid_pattern
.
findall
(
text
)
cid_count
=
len
(
matches
)
cid_count
=
len
(
matches
)
cid_len
=
sum
(
len
(
match
)
for
match
in
matches
)
text_len
=
len
(
text
)
text_len
=
len
(
text
)
logger
.
info
(
f
"cid_count:
{
cid_count
}
, text_len:
{
text_len
}
"
)
if
text_len
==
0
:
if
cid_count
>
10
:
cid_chars_radio
=
0
else
:
cid_chars_radio
=
cid_count
/
(
cid_count
+
text_len
-
cid_len
)
logger
.
info
(
f
"cid_count:
{
cid_count
}
, text_len:
{
text_len
}
, cid_chars_radio:
{
cid_chars_radio
}
"
)
'''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
if
cid_chars_radio
>
0.05
:
return
False
# 乱码文档
return
False
# 乱码文档
else
:
else
:
return
True
# 正常文档
return
True
# 正常文档
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment