Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
dff11700
Commit
dff11700
authored
Jun 13, 2025
by
myhloli
Browse files
feat: update project list in README files to reflect compatibility with version 2.0
parent
d41179da
Changes
435
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
0 additions
and
432 deletions
+0
-432
projects/web_demo/web_demo/common/ext.py
projects/web_demo/web_demo/common/ext.py
+0
-94
projects/web_demo/web_demo/common/import_models.py
projects/web_demo/web_demo/common/import_models.py
+0
-1
projects/web_demo/web_demo/common/logger.py
projects/web_demo/web_demo/common/logger.py
+0
-19
projects/web_demo/web_demo/common/mk_markdown/__init__.py
projects/web_demo/web_demo/common/mk_markdown/__init__.py
+0
-0
projects/web_demo/web_demo/common/mk_markdown/libs/__init__.py
...cts/web_demo/web_demo/common/mk_markdown/libs/__init__.py
+0
-0
projects/web_demo/web_demo/common/mk_markdown/libs/language.py
...cts/web_demo/web_demo/common/mk_markdown/libs/language.py
+0
-36
projects/web_demo/web_demo/common/mk_markdown/libs/markdown_utils.py
...b_demo/web_demo/common/mk_markdown/libs/markdown_utils.py
+0
-31
projects/web_demo/web_demo/common/mk_markdown/libs/ocr_content_type.py
...demo/web_demo/common/mk_markdown/libs/ocr_content_type.py
+0
-38
projects/web_demo/web_demo/common/mk_markdown/mk_markdown.py
projects/web_demo/web_demo/common/mk_markdown/mk_markdown.py
+0
-169
projects/web_demo/web_demo/common/mk_markdown/resources/fasttext-langdetect/lid.176.ftz
...mon/mk_markdown/resources/fasttext-langdetect/lid.176.ftz
+0
-0
projects/web_demo/web_demo/common/web_hook.py
projects/web_demo/web_demo/common/web_hook.py
+0
-9
projects/web_demo/web_demo/config/__init__.py
projects/web_demo/web_demo/config/__init__.py
+0
-0
projects/web_demo/web_demo/config/config.yaml
projects/web_demo/web_demo/config/config.yaml
+0
-35
projects/web_demo/web_demo/config/mineru_web.db
projects/web_demo/web_demo/config/mineru_web.db
+0
-0
projects/web_demo/web_demo/static/__init__.py
projects/web_demo/web_demo/static/__init__.py
+0
-0
No files found.
projects/web_demo/web_demo/common/ext.py
deleted
100644 → 0
View file @
d41179da
import
hashlib
import
mimetypes
import
urllib.parse
def _starts_with_pdf_magic(file):
    """Return True when *file* yields the ``%PDF-`` signature.

    Reads five bytes from the file's current position, then rewinds the file
    to offset 0 so that later consumers see the stream from the start.
    """
    header = file.read(5)
    file.seek(0)  # reset the file pointer
    return header.startswith(b'%PDF-')


def is_pdf(filename, file):
    """
    Decide whether an uploaded file is a PDF; the file name may be
    URL-encoded (non-ASCII names and special characters are supported).

    The MIME type guessed from the decoded file name is used as a first
    filter; the ``%PDF-`` file signature is the final authority.

    :param filename: file name, possibly percent-encoded
    :param file: binary file object supporting ``read`` and ``seek``
    :return: True if the file is a PDF, False otherwise
    """
    try:
        # URL-decode the file name so special characters are handled.
        decoded_filename = urllib.parse.unquote(filename)

        # Guess the MIME type from the file name.
        mime_type, _ = mimetypes.guess_type(decoded_filename)
        print(f"Detected MIME type: {mime_type}")

        # A known, non-PDF MIME type rejects the file outright. When the type
        # cannot be guessed (None) only the file signature is checked.
        if mime_type is not None and mime_type != 'application/pdf':
            return False

        return _starts_with_pdf_magic(file)
    except Exception as e:
        print(f"Error checking PDF format: {str(e)}")
        # Even after an error, still try to decide from the file header.
        try:
            return _starts_with_pdf_magic(file)
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return False
def url_is_pdf(file):
    """
    Decide whether a file object holds a PDF by inspecting its content only.

    Five bytes are read from the current position and the file is then
    rewound to offset 0.

    :param file: binary file object supporting ``read`` and ``seek``
    :return: True if the bytes read start with ``%PDF-``, False otherwise
    """
    magic = file.read(5)
    file.seek(0)
    return True if magic.startswith(b'%PDF-') else False
def calculate_file_hash(file, algorithm='sha256'):
    """
    Compute the hex digest of a file object's content.

    The file is read in 64 KB chunks from its current position and rewound
    to offset 0 afterwards.

    :param file: binary file object supporting ``read`` and ``seek``
    :param algorithm: hash algorithm name, e.g. 'sha256', 'md5', 'sha1'
    :return: hexadecimal digest string
    :raises ValueError: if ``algorithm`` is not a supported hash name
    """
    # hashlib.new() validates the name; the previous getattr(hashlib, name)()
    # would happily call any attribute of the hashlib module.
    hash_func = hashlib.new(algorithm)
    block_size = 65536  # 64KB chunks

    buffer = file.read(block_size)
    while buffer:
        hash_func.update(buffer)
        buffer = file.read(block_size)

    file.seek(0)
    return hash_func.hexdigest()
def singleton_func(cls):
    """
    Class decorator that makes ``cls`` behave as a singleton.

    The returned callable constructs ``cls`` on its first call (with the
    arguments of that call) and returns the same object on every later
    call; arguments of later calls are ignored.

    :param cls: class to wrap
    :return: factory function returning the single shared instance
    """
    created = {}

    def _singleton(*args, **kwargs):
        if cls in created:
            return created[cls]
        created[cls] = cls(*args, **kwargs)
        return created[cls]

    return _singleton
projects/web_demo/web_demo/common/import_models.py
deleted
100644 → 0
View file @
d41179da
from
api.analysis.models
import
*
\ No newline at end of file
projects/web_demo/web_demo/common/logger.py
deleted
100644 → 0
View file @
d41179da
import
os
from
loguru
import
logger
from
pathlib
import
Path
from
datetime
import
datetime
def setup_log(config):
    """
    Register a daily log file as a loguru sink.

    The file is ``log_<YYYY-MM-DD>.log`` inside a ``log`` directory located
    two levels above this module's file; the directory is created if needed.

    :param config: mapping-like object; ``config.get("LOG_LEVEL")`` supplies
        the minimum level for the sink ("DEBUG" is used when it is missing)
    :return: None
    """
    log_dir = Path(__file__).parent.parent / "log"
    # exist_ok=True already covers the "directory exists" case, so no prior
    # exists() check is needed (and none can race with another process).
    log_dir.mkdir(parents=True, exist_ok=True)

    log_level = config.get("LOG_LEVEL")
    if log_level is None:
        # loguru rejects level=None; fall back to its documented default.
        log_level = "DEBUG"

    log_name = f'log_{datetime.now().strftime("%Y-%m-%d")}.log'
    log_file_path = os.path.join(log_dir, log_name)
    logger.add(
        str(log_file_path),
        rotation='00:00',
        encoding='utf-8',
        level=log_level,
        enqueue=True,
    )
projects/web_demo/web_demo/common/mk_markdown/__init__.py
deleted
100644 → 0
View file @
d41179da
projects/web_demo/web_demo/common/mk_markdown/libs/__init__.py
deleted
100644 → 0
View file @
d41179da
projects/web_demo/web_demo/common/mk_markdown/libs/language.py
deleted
100644 → 0
View file @
d41179da
import
os
import
unicodedata
# Default the FTLANG_CACHE environment variable to the
# ``resources/fasttext-langdetect`` directory that sits next to this file's
# parent directory, unless the variable is already set to a non-empty value.
# NOTE(review): presumably read by ``fast_langdetect`` (imported right after
# this block) to locate its model file — confirm against that library.
if not os.getenv("FTLANG_CACHE"):
    current_file_path = os.path.abspath(__file__)
    current_dir = os.path.dirname(current_file_path)
    root_dir = os.path.dirname(current_dir)  # parent of this file's directory
    ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
    os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
# print(os.getenv("FTLANG_CACHE"))
from
fast_langdetect
import
detect_language
def detect_lang(text: str) -> str:
    """
    Detect the language of ``text`` and return it as a lower-case code.

    ``detect_language`` is tried on the raw text first; if it raises, the
    call is retried once with every character whose Unicode general category
    starts with ``C`` removed. An error from the retry propagates.

    :param text: text to classify
    :return: lower-cased detection result, or "" for empty input or when the
        result has no usable ``lower()``
    """
    if not text:
        return ""

    try:
        lang_upper = detect_language(text)
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are not swallowed. Retry without control-like characters.
        html_no_ctrl_chars = ''.join(
            ch for ch in text if unicodedata.category(ch)[0] != 'C'
        )
        lang_upper = detect_language(html_no_ctrl_chars)

    try:
        return lang_upper.lower()
    except Exception:
        # e.g. the detector returned None or another non-string value.
        return ""
if __name__ == '__main__':
    # Manual smoke check: show the cache location in effect, then the
    # detection result for a few English/Chinese samples with and without
    # HTML tags.
    print(os.getenv("FTLANG_CACHE"))
    for sample in (
        "This is a test.",
        "<html>This is a test</html>",
        "这个是中文测试。",
        "<html>这个是中文测试。</html>",
    ):
        print(detect_lang(sample))
projects/web_demo/web_demo/common/mk_markdown/libs/markdown_utils.py
deleted
100644 → 0
View file @
d41179da
import
re
def escape_special_markdown_char(pymu_blocks):
    """
    Escape characters in body text that have special meaning in Markdown.

    Every span's ``'text'`` is rewritten in place so that ``*``, `` ` ``,
    ``~`` and ``$`` are preceded by a backslash. Spans whose ``'_type'`` is
    ``'inline-equation'`` or ``'interline-equation'`` are left untouched, as
    are spans with empty text.

    :param pymu_blocks: iterable of blocks shaped like
        ``{'lines': [{'spans': [{'text': ...}, ...]}, ...]}``
    :return: the same ``pymu_blocks`` object, modified in place
    """
    special_chars = ["*", "`", "~", "$"]
    equation_types = ('inline-equation', 'interline-equation')

    for blk in pymu_blocks:
        for line in blk['lines']:
            for span in line['spans']:
                # The span type does not depend on the character being
                # escaped, so it is checked once per span rather than once
                # per special character.
                if span.get("_type", None) in equation_types:
                    continue
                span_text = span['text']
                if not span_text:
                    continue
                for char in special_chars:
                    span_text = span_text.replace(char, "\\" + char)
                span['text'] = span_text

    return pymu_blocks
def ocr_escape_special_markdown_char(content):
    """
    Escape characters in body text that have special meaning in Markdown.

    :param content: text to escape
    :return: ``content`` with each ``*``, `` ` ``, ``~`` and ``$`` preceded
        by a backslash
    """
    # One pass over the text: each special character maps to "\" + itself.
    escape_table = {ord(ch): "\\" + ch for ch in ("*", "`", "~", "$")}
    return content.translate(escape_table)
projects/web_demo/web_demo/common/mk_markdown/libs/ocr_content_type.py
deleted
100644 → 0
View file @
d41179da
class ContentType:
    # String tags for span content kinds; the Markdown builder in this
    # listing compares them against ``span['type']``.
    Image = 'image'
    Table = 'table'
    Text = 'text'
    InlineEquation = 'inline_equation'
    InterlineEquation = 'interline_equation'
class BlockType:
    # String tags for block kinds; the Markdown builder in this listing
    # compares them against ``para_block['type']`` and, for the nested
    # image/table parts, against ``block['type']``.
    Image = 'image'
    ImageBody = 'image_body'
    ImageCaption = 'image_caption'
    ImageFootnote = 'image_footnote'
    Table = 'table'
    TableBody = 'table_body'
    TableCaption = 'table_caption'
    TableFootnote = 'table_footnote'
    Text = 'text'
    Title = 'title'
    InterlineEquation = 'interline_equation'
    Footnote = 'footnote'
    Discarded = 'discarded'
class CategoryId:
    # Integer category ids. Not referenced elsewhere in this listing.
    # NOTE(review): presumably the class ids emitted by the layout/formula
    # detection models — confirm against the producer of these ids.
    Title = 0
    Text = 1
    Abandon = 2
    ImageBody = 3
    ImageCaption = 4
    TableBody = 5
    TableCaption = 6
    TableFootnote = 7
    InterlineEquation_Layout = 8
    InlineEquation = 13
    InterlineEquation_YOLO = 14
    OcrText = 15
    ImageFootnote = 101
projects/web_demo/web_demo/common/mk_markdown/mk_markdown.py
deleted
100644 → 0
View file @
d41179da
import
re
import
wordninja
from
.libs.language
import
detect_lang
from
.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
.libs.ocr_content_type
import
BlockType
,
ContentType
def __is_hyphen_at_line_end(line):
    """
    Tell whether a line ends with a hyphenated word fragment.

    Args:
        line (str): The line of text to check.

    Returns:
        bool: True if the line ends with one or more ASCII letters followed
        by a hyphen (trailing whitespace is allowed), False otherwise.
    """
    hyphen_match = re.search(r'[A-Za-z]+-\s*$', line)
    return hyphen_match is not None
def split_long_words(text):
    """
    Break up over-long words in ``text`` with ``wordninja``.

    The text is split on single spaces; each piece is tokenised into runs of
    word characters and single non-word characters. Any token longer than
    10 characters is replaced by its ``wordninja.split`` parts joined with
    spaces; all other tokens are kept verbatim.

    :param text: input string
    :return: the text with long tokens split, original spacing preserved
    """
    rebuilt_segments = []
    for segment in text.split(' '):
        tokens = re.findall(r'\w+|[^\w]', segment, re.UNICODE)
        rebuilt_segments.append(''.join(
            ' '.join(wordninja.split(token)) if len(token) > 10 else token
            for token in tokens
        ))
    return ' '.join(rebuilt_segments)
def join_path(*args):
    """
    Concatenate path pieces after dropping each piece's trailing slashes.

    Every argument is converted with ``str`` and right-stripped of ``/``;
    the pieces are then joined with the empty string (no separator is
    inserted between them).

    NOTE(review): joining with '' rather than '/' is the behaviour shown in
    the source — confirm it is intended for how callers build URLs.

    :param args: pieces to concatenate
    :return: the concatenated string
    """
    trimmed = [str(piece).rstrip('/') for piece in args]
    return ''.join(trimmed)
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
                                                img_buket_path):
    """
    Build per-page multimodal ('mm') Markdown for a parsed document.

    Pages whose ``'para_blocks'`` entry is missing or empty are skipped.
    ``page_no`` counts emitted pages only (0-based), so it is not advanced
    for skipped pages.

    :param pdf_info_dict: list of page dicts, each optionally holding
        ``'para_blocks'``
    :param img_buket_path: prefix passed on for image links
    :return: list of ``{'page_no': int, 'md_content': str}`` dicts
    """
    pages = []
    for page_info in pdf_info_dict:
        para_blocks = page_info.get('para_blocks')
        if para_blocks:
            md_parts = ocr_mk_markdown_with_para_core_v2(
                para_blocks, 'mm', img_buket_path)
            # len(pages) is the number of pages emitted so far, i.e. the
            # running page counter.
            pages.append({
                'page_no': len(pages),
                'md_content': '\n\n'.join(md_parts),
            })
    return pages
def merge_para_with_text(para_block):
    """
    Flatten one paragraph block into a Markdown text fragment.

    For every line, the text spans are first concatenated to detect the
    line's language with ``detect_lang``. Each span is then rendered:

    * text spans are Markdown-escaped (mostly-English spans additionally get
      their long words split),
    * inline equations become `` $...$ ``,
    * interline equations become a ``$$`` block.

    Rendered spans are appended without a separator on zh/ja/ko lines and
    with a trailing space otherwise; on English lines a span ending in a
    hyphenated fragment has the hyphen removed and gets no trailing space.

    :param para_block: dict with ``'lines'``, each line holding ``'spans'``
        that carry ``'type'`` and ``'content'``
    :return: the merged paragraph text
    """

    def detect_language(text):
        # Cheap per-span heuristic: 'en' when ASCII letters make up at least
        # half of the characters.
        if len(text) == 0:
            return 'empty'
        en_length = sum(len(match) for match in re.findall(r'[a-zA-Z]+', text))
        return 'en' if en_length / len(text) >= 0.5 else 'unknown'

    cjk_langs = ('zh', 'ja', 'ko')
    para_text = ''
    for line in para_block['lines']:
        # Some documents carry one character per span, where per-span
        # language detection is unreliable; detect on the whole line's text.
        line_text = ''.join(
            span['content'].strip()
            for span in line['spans']
            if span['type'] == ContentType.Text
        )
        line_lang = detect_lang(line_text) if line_text != '' else ''

        for span in line['spans']:
            span_type = span['type']
            content = ''
            if span_type == ContentType.Text:
                content = span['content']
                if detect_language(content) == 'en':
                    # Only split long words for English; word-splitting
                    # Chinese text would lose characters.
                    content = ocr_escape_special_markdown_char(
                        split_long_words(content))
                else:
                    content = ocr_escape_special_markdown_char(content)
            elif span_type == ContentType.InlineEquation:
                content = f" ${span['content']}$ "
            elif span_type == ContentType.InterlineEquation:
                content = f"\n$$\n{span['content']}\n$$\n"

            if content == '':
                continue

            if line_lang in cjk_langs:
                # Chinese/Japanese/Korean: no space between pieces.
                para_text += content
            elif line_lang == 'en' and __is_hyphen_at_line_end(content):
                # A fragment hyphenated at the line end is joined to what
                # follows: drop the hyphen and add no space. The hyphen test
                # tolerates trailing whitespace, so strip that first —
                # otherwise ``[:-1]`` would remove the whitespace and leave
                # the hyphen in place.
                para_text += content.rstrip()[:-1]
            else:
                # Western-script text: pieces are separated by a space.
                para_text += content + ' '
    return para_text
def _merge_blocks_of_type(blocks, block_type):
    """Concatenate the merged text of every block in *blocks* of *block_type*."""
    return ''.join(
        merge_para_with_text(block)
        for block in blocks
        if block['type'] == block_type
    )


def _iter_spans(blocks, block_type, span_type):
    """Yield the *span_type* spans found in the *block_type* blocks, in order."""
    for block in blocks:
        if block['type'] != block_type:
            continue
        for line in block['lines']:
            for span in line['spans']:
                if span['type'] == span_type:
                    yield span


def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                                      mode,
                                      img_buket_path=''):
    """
    Render the paragraph blocks of one page into Markdown fragments.

    Text, title and interline-equation blocks are rendered through
    ``merge_para_with_text`` (titles get a ``# `` prefix). Image and table
    blocks are skipped in ``'nlp'`` mode; in ``'mm'`` mode an image is
    emitted as body, caption, footnote and a table as caption, body,
    footnote. Blocks that render to whitespace only are dropped.

    NOTE(review): this listing was recovered from a whitespace-mangled page.
    The image-link expressions were missing from the source and are
    reconstructed as ``join_path(img_buket_path, span['image_path'])``, and
    whitespace next to f-string placeholders could not be recovered —
    verify every string literal here against the original file.

    :param paras_of_layout: iterable of paragraph block dicts with ``'type'``
    :param mode: ``'nlp'`` (text only) or ``'mm'`` (with images and tables)
    :param img_buket_path: prefix for image links
    :return: list of Markdown strings, one per non-empty block
    """
    page_markdown = []
    for para_block in paras_of_layout:
        para_text = ''
        para_type = para_block['type']
        if para_type == BlockType.Text:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Title:
            para_text = f'# {merge_para_with_text(para_block)}'
        elif para_type == BlockType.InterlineEquation:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Image:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                blocks = para_block['blocks']
                # 1st: image body
                for span in _iter_spans(blocks, BlockType.ImageBody,
                                        ContentType.Image):
                    para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
                # 2nd: image caption
                para_text += _merge_blocks_of_type(blocks, BlockType.ImageCaption)
                # 3rd: image footnote
                para_text += _merge_blocks_of_type(blocks, BlockType.ImageFootnote)
        elif para_type == BlockType.Table:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                blocks = para_block['blocks']
                # 1st: table caption
                para_text += _merge_blocks_of_type(blocks, BlockType.TableCaption)
                # 2nd: table body
                for span in _iter_spans(blocks, BlockType.TableBody,
                                        ContentType.Table):
                    # if processed by table model
                    if span.get('latex', ''):
                        para_text += f"\n\n$\n {span['latex']}\n$\n\n"
                    elif span.get('html', ''):
                        para_text += f"\n\n{span['html']}\n\n"
                    else:
                        para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
                # 3rd: table footnote
                para_text += _merge_blocks_of_type(blocks, BlockType.TableFootnote)

        if para_text.strip() == '':
            continue
        page_markdown.append(para_text.strip() + ' ')

    return page_markdown
projects/web_demo/web_demo/common/mk_markdown/resources/fasttext-langdetect/lid.176.ftz
deleted
100644 → 0
View file @
d41179da
File deleted
projects/web_demo/web_demo/common/web_hook.py
deleted
100644 → 0
View file @
d41179da
def before_request():
    """Pre-request hook; performs no work and returns None."""
    return
def after_request(response):
    """
    Post-request hook that adds CORS headers to the response.

    Adds ``Access-Control-Allow-Origin: *`` and
    ``Access-Control-Allow-Headers: Content-Type,Authorization`` through
    ``response.headers.add``.

    :param response: response object exposing ``headers.add(name, value)``
    :return: the same response object
    """
    cors_headers = (
        ('Access-Control-Allow-Origin', '*'),
        ('Access-Control-Allow-Headers', 'Content-Type,Authorization'),
    )
    for header_name, header_value in cors_headers:
        response.headers.add(header_name, header_value)
    return response
projects/web_demo/web_demo/config/__init__.py
deleted
100644 → 0
View file @
d41179da
projects/web_demo/web_demo/config/config.yaml
deleted
100644 → 0
View file @
d41179da
# Base configuration shared by every environment (merged in via the
# `base` anchor).
BaseConfig: &base
  DEBUG: false
  PORT: 5559
  LOG_LEVEL: "DEBUG"
  SQLALCHEMY_TRACK_MODIFICATIONS: true
  SQLALCHEMY_DATABASE_URI: ""
  PROPAGATE_EXCEPTIONS: true
  # NOTE(review): both secrets are hard-coded and identical; presumably they
  # should be supplied per deployment instead — confirm.
  SECRET_KEY: "#$%^&**$##*(*^%%$**((&"
  JWT_SECRET_KEY: "#$%^&**$##*(*^%%$**((&"
  JWT_ACCESS_TOKEN_EXPIRES: 3600
  PDF_UPLOAD_FOLDER: "upload_pdf"
  PDF_ANALYSIS_FOLDER: "analysis_pdf"
  # Path of the built front-end bundle
  REACT_APP_DIST: "../../web/dist/"
  # File access path
  FILE_API: "/api/v2/analysis/pdf_img?as_attachment=False"

# Development configuration
DevelopmentConfig:
  <<: *base
  database:
    type: sqlite
    path: config/mineru_web.db

# Production configuration
ProductionConfig:
  <<: *base

# Testing configuration
TestingConfig:
  <<: *base

# Configuration currently in use
CurrentConfig: "DevelopmentConfig"
projects/web_demo/web_demo/config/mineru_web.db
deleted
100644 → 0
View file @
d41179da
File deleted
projects/web_demo/web_demo/static/__init__.py
deleted
100644 → 0
View file @
d41179da
Prev
1
…
18
19
20
21
22
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment