Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
f35a6c08
"git@developer.sourcefind.cn:orangecat/ollama.git" did not exist on "5528dd9d1170e7a78a4fdb7684e8944e2052ca8f"
Commit
f35a6c08
authored
Feb 09, 2025
by
myhloli
Browse files
refactor(filter): remove unused text layout analysis for PDF classification
parent
9f18ca20
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
11 additions
and
9 deletions
+11
-9
magic_pdf/filter/__init__.py
magic_pdf/filter/__init__.py
+1
-1
magic_pdf/filter/pdf_classify_by_type.py
magic_pdf/filter/pdf_classify_by_type.py
+6
-4
magic_pdf/filter/pdf_meta_scan.py
magic_pdf/filter/pdf_meta_scan.py
+4
-4
No files found.
magic_pdf/filter/__init__.py
View file @
f35a6c08
...
...
@@ -23,7 +23,7 @@ def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
pdf_meta
[
'image_info_per_page'
],
pdf_meta
[
'text_len_per_page'
],
pdf_meta
[
'imgs_per_page'
],
pdf_meta
[
'text_layout_per_page'
],
#
pdf_meta['text_layout_per_page'],
pdf_meta
[
'invalid_chars'
],
)
if
is_text_pdf
:
...
...
magic_pdf/filter/pdf_classify_by_type.py
View file @
f35a6c08
...
...
@@ -305,7 +305,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
def
classify
(
total_page
:
int
,
page_width
,
page_height
,
img_sz_list
:
list
,
text_len_list
:
list
,
img_num_list
:
list
,
text_layout_list
:
list
,
invalid_chars
:
bool
):
# text_layout_list: list,
invalid_chars
:
bool
):
"""
这里的图片和页面长度单位是pts
:param total_page:
...
...
@@ -321,7 +322,7 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
'by_text_len'
:
classify_by_text_len
(
text_len_list
,
total_page
),
'by_avg_words'
:
classify_by_avg_words
(
text_len_list
),
'by_img_num'
:
classify_by_img_num
(
img_sz_list
,
img_num_list
),
'by_text_layout'
:
classify_by_text_layout
(
text_layout_list
),
#
'by_text_layout': classify_by_text_layout(text_layout_list),
'by_img_narrow_strips'
:
classify_by_img_narrow_strips
(
page_width
,
page_height
,
img_sz_list
),
'by_invalid_chars'
:
invalid_chars
,
}
...
...
@@ -332,9 +333,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
return
False
,
results
else
:
logger
.
warning
(
f
"
pdf is not classified by area and text_len
, by_image_area:
{
results
[
'by_image_area'
]
}
,"
f
"
OCR needed based on classification result
, by_image_area:
{
results
[
'by_image_area'
]
}
,"
f
" by_text:
{
results
[
'by_text_len'
]
}
, by_avg_words:
{
results
[
'by_avg_words'
]
}
, by_img_num:
{
results
[
'by_img_num'
]
}
,"
f
" by_text_layout:
{
results
[
'by_text_layout'
]
}
, by_img_narrow_strips:
{
results
[
'by_img_narrow_strips'
]
}
,"
# f" by_text_layout: {results['by_text_layout']},"
f
" by_img_narrow_strips:
{
results
[
'by_img_narrow_strips'
]
}
,"
f
" by_invalid_chars:
{
results
[
'by_invalid_chars'
]
}
"
,
file
=
sys
.
stderr
)
# 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
return
False
,
results
...
...
magic_pdf/filter/pdf_meta_scan.py
View file @
f35a6c08
...
...
@@ -356,9 +356,9 @@ def pdf_meta_scan(pdf_bytes: bytes):
# logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
text_len_per_page
=
get_pdf_textlen_per_page
(
doc
)
# logger.info(f"text_len_per_page: {text_len_per_page}")
text_layout_per_page
=
get_pdf_text_layout_per_page
(
doc
)
#
text_layout_per_page = get_pdf_text_layout_per_page(doc)
# logger.info(f"text_layout_per_page: {text_layout_per_page}")
text_language
=
get_language
(
doc
)
#
text_language = get_language(doc)
# logger.info(f"text_language: {text_language}")
invalid_chars
=
check_invalid_chars
(
pdf_bytes
)
# logger.info(f"invalid_chars: {invalid_chars}")
...
...
@@ -372,8 +372,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
'page_height_pts'
:
int
(
page_height_pts
),
'image_info_per_page'
:
image_info_per_page
,
'text_len_per_page'
:
text_len_per_page
,
'text_layout_per_page'
:
text_layout_per_page
,
'text_language'
:
text_language
,
#
'text_layout_per_page': text_layout_per_page,
#
'text_language': text_language,
# "svgs_per_page": svgs_per_page,
'imgs_per_page'
:
imgs_per_page
,
# 增加每页img数量list
'junk_img_bojids'
:
junk_img_bojids
,
# 增加垃圾图片的bojid list
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment