Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
0f83b568
Unverified
Commit
0f83b568
authored
Nov 11, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 11, 2024
Browse files
Merge pull request #922 from myhloli/dev
refactor(model download script)
parents
fd646101
9496c6c4
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
10 additions
and
131 deletions
+10
-131
README.md
README.md
+2
-0
README_zh-CN.md
README_zh-CN.md
+2
-0
docs/download_models.py
docs/download_models.py
+0
-59
docs/download_models_hf.py
docs/download_models_hf.py
+0
-66
docs/how_to_download_models_en.md
docs/how_to_download_models_en.md
+1
-1
docs/how_to_download_models_zh_cn.md
docs/how_to_download_models_zh_cn.md
+2
-2
magic-pdf.template.json
magic-pdf.template.json
+1
-1
scripts/download_models.py
scripts/download_models.py
+1
-1
scripts/download_models_hf.py
scripts/download_models_hf.py
+1
-1
No files found.
README.md
View file @
0f83b568
...
@@ -243,7 +243,9 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
...
@@ -243,7 +243,9 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
# Acknowledgments
# Acknowledgments
-
[
PDF-Extract-Kit
](
https://github.com/opendatalab/PDF-Extract-Kit
)
-
[
PDF-Extract-Kit
](
https://github.com/opendatalab/PDF-Extract-Kit
)
-
[
DocLayout-YOLO
](
https://github.com/opendatalab/DocLayout-YOLO
)
-
[
StructEqTable
](
https://github.com/UniModal4Reasoning/StructEqTable-Deploy
)
-
[
StructEqTable
](
https://github.com/UniModal4Reasoning/StructEqTable-Deploy
)
-
[
RapidTable
](
https://github.com/RapidAI/RapidTable
)
-
[
PaddleOCR
](
https://github.com/PaddlePaddle/PaddleOCR
)
-
[
PaddleOCR
](
https://github.com/PaddlePaddle/PaddleOCR
)
-
[
PyMuPDF
](
https://github.com/pymupdf/PyMuPDF
)
-
[
PyMuPDF
](
https://github.com/pymupdf/PyMuPDF
)
-
[
layoutreader
](
https://github.com/ppaanngggg/layoutreader
)
-
[
layoutreader
](
https://github.com/ppaanngggg/layoutreader
)
...
...
README_zh-CN.md
View file @
0f83b568
...
@@ -253,7 +253,9 @@ TODO
...
@@ -253,7 +253,9 @@ TODO
# Acknowledgments
# Acknowledgments
-
[
PDF-Extract-Kit
](
https://github.com/opendatalab/PDF-Extract-Kit
)
-
[
PDF-Extract-Kit
](
https://github.com/opendatalab/PDF-Extract-Kit
)
-
[
DocLayout-YOLO
](
https://github.com/opendatalab/DocLayout-YOLO
)
-
[
StructEqTable
](
https://github.com/UniModal4Reasoning/StructEqTable-Deploy
)
-
[
StructEqTable
](
https://github.com/UniModal4Reasoning/StructEqTable-Deploy
)
-
[
RapidTable
](
https://github.com/RapidAI/RapidTable
)
-
[
PaddleOCR
](
https://github.com/PaddlePaddle/PaddleOCR
)
-
[
PaddleOCR
](
https://github.com/PaddlePaddle/PaddleOCR
)
-
[
PyMuPDF
](
https://github.com/pymupdf/PyMuPDF
)
-
[
PyMuPDF
](
https://github.com/pymupdf/PyMuPDF
)
-
[
layoutreader
](
https://github.com/ppaanngggg/layoutreader
)
-
[
layoutreader
](
https://github.com/ppaanngggg/layoutreader
)
...
...
docs/download_models.py
deleted
100644 → 0
View file @
fd646101
import
json
import
os
import
requests
from
modelscope
import
snapshot_download
def download_json(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the decoded JSON body.

    Args:
        url: Address of the JSON document to download.
        timeout: Seconds to wait for the server before giving up. Without
            an explicit timeout ``requests.get`` can block indefinitely on
            a stalled connection.

    Returns:
        The parsed JSON payload as returned by ``response.json()``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    # Download the JSON file.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on a 4xx/5xx answer instead of parsing an error page.
    response.raise_for_status()
    return response.json()
def download_and_modify_json(url, local_filename, modifications):
    """Create or refresh a local JSON config and apply overrides to it.

    If *local_filename* already exists it is loaded and kept, unless its
    ``config_version`` entry (``'0.0.0'`` when absent) compares below
    ``'1.0.0'``, in which case the template is downloaded from *url*
    instead. If the file does not exist the template is downloaded. The
    entries of *modifications* are then written on top of the data and the
    result is saved back to *local_filename*.

    Args:
        url: Location of the JSON template, passed to ``download_json``.
        local_filename: Path of the config file to read and overwrite.
        modifications: Mapping of keys to the values they must have in the
            saved file.
    """
    if os.path.exists(local_filename):
        # Read with the same encoding used for writing below, and close the
        # handle deterministically instead of leaking it.
        with open(local_filename, encoding='utf-8') as f:
            data = json.load(f)
        config_version = data.get('config_version', '0.0.0')
        # NOTE: this is a plain string comparison, not a numeric version
        # comparison.
        if config_version < '1.0.0':
            data = download_json(url)
    else:
        data = download_json(url)

    # Apply the requested overrides.
    for key, value in modifications.items():
        data[key] = value

    # Persist the modified content.
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
    # Only these sub-trees of the PDF-Extract-Kit repository are requested.
    wanted_patterns = [
        "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_small/*",
        "models/TabRec/TableMaster/*",
        "models/TabRec/StructEqTable/*",
    ]
    # presumably snapshot_download returns the local snapshot path - verify
    # against the modelscope documentation.
    kit_root = snapshot_download(
        'opendatalab/PDF-Extract-Kit-1.0',
        allow_patterns=wanted_patterns,
    )
    layoutreader_path = snapshot_download('ppaanngggg/layoutreader')
    models_path = kit_root + '/models'
    print(f'model_dir is: {models_path}')
    print(f'layoutreader_model_dir is: {layoutreader_path}')

    # The config file lives directly in the user's home directory.
    target_config = os.path.join(os.path.expanduser('~'), 'magic-pdf.json')
    download_and_modify_json(
        'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json',
        target_config,
        {
            'models-dir': models_path,
            'layoutreader-model-dir': layoutreader_path,
        },
    )
    print(f'The configuration file has been configured successfully, the path is: {target_config}')
docs/download_models_hf.py
deleted
100644 → 0
View file @
fd646101
import
json
import
os
import
requests
from
huggingface_hub
import
snapshot_download
def download_json(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the decoded JSON body.

    Args:
        url: Address of the JSON document to download.
        timeout: Seconds to wait for the server before giving up. Without
            an explicit timeout ``requests.get`` can block indefinitely on
            a stalled connection.

    Returns:
        The parsed JSON payload as returned by ``response.json()``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    # Download the JSON file.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on a 4xx/5xx answer instead of parsing an error page.
    response.raise_for_status()
    return response.json()
def download_and_modify_json(url, local_filename, modifications):
    """Create or refresh a local JSON config and apply overrides to it.

    If *local_filename* already exists it is loaded and kept, unless its
    ``config_version`` entry (``'0.0.0'`` when absent) compares below
    ``'1.0.0'``, in which case the template is downloaded from *url*
    instead. If the file does not exist the template is downloaded. The
    entries of *modifications* are then written on top of the data and the
    result is saved back to *local_filename*.

    Args:
        url: Location of the JSON template, passed to ``download_json``.
        local_filename: Path of the config file to read and overwrite.
        modifications: Mapping of keys to the values they must have in the
            saved file.
    """
    if os.path.exists(local_filename):
        # Read with the same encoding used for writing below, and close the
        # handle deterministically instead of leaking it.
        with open(local_filename, encoding='utf-8') as f:
            data = json.load(f)
        config_version = data.get('config_version', '0.0.0')
        # NOTE: this is a plain string comparison, not a numeric version
        # comparison.
        if config_version < '1.0.0':
            data = download_json(url)
    else:
        data = download_json(url)

    # Apply the requested overrides.
    for key, value in modifications.items():
        data[key] = value

    # Persist the modified content.
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
    # Only these sub-trees of the PDF-Extract-Kit repository are requested.
    wanted_patterns = [
        "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_small/*",
        "models/TabRec/TableMaster/*",
        "models/TabRec/StructEqTable/*",
    ]
    # presumably snapshot_download returns the local snapshot path - verify
    # against the huggingface_hub documentation.
    kit_root = snapshot_download(
        'opendatalab/PDF-Extract-Kit-1.0',
        allow_patterns=wanted_patterns,
    )

    # For layoutreader only the JSON and safetensors files are requested.
    layoutreader_files = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_path = snapshot_download(
        'hantian/layoutreader',
        allow_patterns=layoutreader_files,
    )

    models_path = kit_root + '/models'
    print(f'model_dir is: {models_path}')
    print(f'layoutreader_model_dir is: {layoutreader_path}')

    # The config file lives directly in the user's home directory.
    target_config = os.path.join(os.path.expanduser('~'), 'magic-pdf.json')
    download_and_modify_json(
        'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json',
        target_config,
        {
            'models-dir': models_path,
            'layoutreader-model-dir': layoutreader_path,
        },
    )
    print(f'The configuration file has been configured successfully, the path is: {target_config}')
docs/how_to_download_models_en.md
View file @
0f83b568
...
@@ -8,7 +8,7 @@ Use a Python Script to Download Model Files from Hugging Face
...
@@ -8,7 +8,7 @@ Use a Python Script to Download Model Files from Hugging Face
```
bash
```
bash
pip
install
huggingface_hub
pip
install
huggingface_hub
wget https://github.com/opendatalab/MinerU/raw/master/
doc
s/download_models_hf.py
-O
download_models_hf.py
wget https://github.com/opendatalab/MinerU/raw/master/
script
s/download_models_hf.py
-O
download_models_hf.py
python download_models_hf.py
python download_models_hf.py
```
```
...
...
docs/how_to_download_models_zh_cn.md
View file @
0f83b568
...
@@ -8,7 +8,7 @@
...
@@ -8,7 +8,7 @@
<summary>
方法一:从 Hugging Face 下载模型
</summary>
<summary>
方法一:从 Hugging Face 下载模型
</summary>
<p>
使用python脚本 从Hugging Face下载模型文件
</p>
<p>
使用python脚本 从Hugging Face下载模型文件
</p>
<pre><code>
pip install huggingface_hub
<pre><code>
pip install huggingface_hub
wget https://gitee.com/myhloli/MinerU/raw/master/
doc
s/download_models_hf.py -O download_models_hf.py
wget https://gitee.com/myhloli/MinerU/raw/master/
script
s/download_models_hf.py -O download_models_hf.py
python download_models_hf.py
</code></pre>
python download_models_hf.py
</code></pre>
</details>
</details>
...
@@ -18,7 +18,7 @@ python download_models_hf.py</code></pre>
...
@@ -18,7 +18,7 @@ python download_models_hf.py</code></pre>
```
bash
```
bash
pip
install
modelscope
pip
install
modelscope
wget https://gitee.com/myhloli/MinerU/raw/master/
doc
s/download_models.py
-O
download_models.py
wget https://gitee.com/myhloli/MinerU/raw/master/
script
s/download_models.py
-O
download_models.py
python download_models.py
python download_models.py
```
```
...
...
magic-pdf.template.json
View file @
0f83b568
...
@@ -15,7 +15,7 @@
...
@@ -15,7 +15,7 @@
"enable"
:
true
"enable"
:
true
},
},
"table-config"
:
{
"table-config"
:
{
"model"
:
"
tablemaster
"
,
"model"
:
"
rapid_table
"
,
"enable"
:
false
,
"enable"
:
false
,
"max_time"
:
400
"max_time"
:
400
},
},
...
...
scripts/download_models.py
View file @
0f83b568
...
@@ -45,7 +45,7 @@ if __name__ == '__main__':
...
@@ -45,7 +45,7 @@ if __name__ == '__main__':
print
(
f
'model_dir is:
{
model_dir
}
'
)
print
(
f
'model_dir is:
{
model_dir
}
'
)
print
(
f
'layoutreader_model_dir is:
{
layoutreader_model_dir
}
'
)
print
(
f
'layoutreader_model_dir is:
{
layoutreader_model_dir
}
'
)
json_url
=
'https://gitee.com/myhloli/MinerU/raw/
dev
/magic-pdf.template.json'
json_url
=
'https://gitee.com/myhloli/MinerU/raw/
master
/magic-pdf.template.json'
config_file_name
=
'magic-pdf.json'
config_file_name
=
'magic-pdf.json'
home_dir
=
os
.
path
.
expanduser
(
'~'
)
home_dir
=
os
.
path
.
expanduser
(
'~'
)
config_file
=
os
.
path
.
join
(
home_dir
,
config_file_name
)
config_file
=
os
.
path
.
join
(
home_dir
,
config_file_name
)
...
...
scripts/download_models_hf.py
View file @
0f83b568
...
@@ -52,7 +52,7 @@ if __name__ == '__main__':
...
@@ -52,7 +52,7 @@ if __name__ == '__main__':
print
(
f
'model_dir is:
{
model_dir
}
'
)
print
(
f
'model_dir is:
{
model_dir
}
'
)
print
(
f
'layoutreader_model_dir is:
{
layoutreader_model_dir
}
'
)
print
(
f
'layoutreader_model_dir is:
{
layoutreader_model_dir
}
'
)
json_url
=
'https://github.com/opendatalab/MinerU/raw/
dev
/magic-pdf.template.json'
json_url
=
'https://github.com/opendatalab/MinerU/raw/
master
/magic-pdf.template.json'
config_file_name
=
'magic-pdf.json'
config_file_name
=
'magic-pdf.json'
home_dir
=
os
.
path
.
expanduser
(
'~'
)
home_dir
=
os
.
path
.
expanduser
(
'~'
)
config_file
=
os
.
path
.
join
(
home_dir
,
config_file_name
)
config_file
=
os
.
path
.
join
(
home_dir
,
config_file_name
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment