Unverified Commit 0f83b568 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #922 from myhloli/dev

refactor(model download script)
parents fd646101 9496c6c4
...@@ -243,7 +243,9 @@ This project currently uses PyMuPDF to achieve advanced functionality. However, ...@@ -243,7 +243,9 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
# Acknowledgments # Acknowledgments
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
- [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
- [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy) - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
- [RapidTable](https://github.com/RapidAI/RapidTable)
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF) - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
- [layoutreader](https://github.com/ppaanngggg/layoutreader) - [layoutreader](https://github.com/ppaanngggg/layoutreader)
......
...@@ -253,7 +253,9 @@ TODO ...@@ -253,7 +253,9 @@ TODO
# Acknowledgments # Acknowledgments
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
- [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
- [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy) - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
- [RapidTable](https://github.com/RapidAI/RapidTable)
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF) - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
- [layoutreader](https://github.com/ppaanngggg/layoutreader) - [layoutreader](https://github.com/ppaanngggg/layoutreader)
......
import json
import os
import requests
from modelscope import snapshot_download
def download_json(url):
# 下载JSON文件
response = requests.get(url)
response.raise_for_status() # 检查请求是否成功
return response.json()
def download_and_modify_json(url, local_filename, modifications):
if os.path.exists(local_filename):
data = json.load(open(local_filename))
config_version = data.get('config_version', '0.0.0')
if config_version < '1.0.0':
data = download_json(url)
else:
data = download_json(url)
# 修改内容
for key, value in modifications.items():
data[key] = value
# 保存修改后的内容
with open(local_filename, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
mineru_patterns = [
"models/Layout/LayoutLMv3/*",
"models/Layout/YOLO/*",
"models/MFD/YOLO/*",
"models/MFR/unimernet_small/*",
"models/TabRec/TableMaster/*",
"models/TabRec/StructEqTable/*",
]
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
model_dir = model_dir + '/models'
print(f'model_dir is: {model_dir}')
print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json'
config_file_name = 'magic-pdf.json'
home_dir = os.path.expanduser('~')
config_file = os.path.join(home_dir, config_file_name)
json_mods = {
'models-dir': model_dir,
'layoutreader-model-dir': layoutreader_model_dir,
}
download_and_modify_json(json_url, config_file, json_mods)
print(f'The configuration file has been configured successfully, the path is: {config_file}')
import json
import os
import requests
from huggingface_hub import snapshot_download
def download_json(url):
# 下载JSON文件
response = requests.get(url)
response.raise_for_status() # 检查请求是否成功
return response.json()
def download_and_modify_json(url, local_filename, modifications):
if os.path.exists(local_filename):
data = json.load(open(local_filename))
config_version = data.get('config_version', '0.0.0')
if config_version < '1.0.0':
data = download_json(url)
else:
data = download_json(url)
# 修改内容
for key, value in modifications.items():
data[key] = value
# 保存修改后的内容
with open(local_filename, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
mineru_patterns = [
"models/Layout/LayoutLMv3/*",
"models/Layout/YOLO/*",
"models/MFD/YOLO/*",
"models/MFR/unimernet_small/*",
"models/TabRec/TableMaster/*",
"models/TabRec/StructEqTable/*",
]
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
layoutreader_pattern = [
"*.json",
"*.safetensors",
]
layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)
model_dir = model_dir + '/models'
print(f'model_dir is: {model_dir}')
print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json'
config_file_name = 'magic-pdf.json'
home_dir = os.path.expanduser('~')
config_file = os.path.join(home_dir, config_file_name)
json_mods = {
'models-dir': model_dir,
'layoutreader-model-dir': layoutreader_model_dir,
}
download_and_modify_json(json_url, config_file, json_mods)
print(f'The configuration file has been configured successfully, the path is: {config_file}')
...@@ -8,7 +8,7 @@ Use a Python Script to Download Model Files from Hugging Face ...@@ -8,7 +8,7 @@ Use a Python Script to Download Model Files from Hugging Face
```bash ```bash
pip install huggingface_hub pip install huggingface_hub
wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
python download_models_hf.py python download_models_hf.py
``` ```
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
<summary>方法一:从 Hugging Face 下载模型</summary> <summary>方法一:从 Hugging Face 下载模型</summary>
<p>使用python脚本 从Hugging Face下载模型文件</p> <p>使用python脚本 从Hugging Face下载模型文件</p>
<pre><code>pip install huggingface_hub <pre><code>pip install huggingface_hub
wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
python download_models_hf.py</code></pre> python download_models_hf.py</code></pre>
</details> </details>
...@@ -18,7 +18,7 @@ python download_models_hf.py</code></pre> ...@@ -18,7 +18,7 @@ python download_models_hf.py</code></pre>
```bash ```bash
pip install modelscope pip install modelscope
wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py -O download_models.py wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py -O download_models.py
python download_models.py python download_models.py
``` ```
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
"enable": true "enable": true
}, },
"table-config": { "table-config": {
"model": "tablemaster", "model": "rapid_table",
"enable": false, "enable": false,
"max_time": 400 "max_time": 400
}, },
......
...@@ -45,7 +45,7 @@ if __name__ == '__main__': ...@@ -45,7 +45,7 @@ if __name__ == '__main__':
print(f'model_dir is: {model_dir}') print(f'model_dir is: {model_dir}')
print(f'layoutreader_model_dir is: {layoutreader_model_dir}') print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json' json_url = 'https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json'
config_file_name = 'magic-pdf.json' config_file_name = 'magic-pdf.json'
home_dir = os.path.expanduser('~') home_dir = os.path.expanduser('~')
config_file = os.path.join(home_dir, config_file_name) config_file = os.path.join(home_dir, config_file_name)
......
...@@ -52,7 +52,7 @@ if __name__ == '__main__': ...@@ -52,7 +52,7 @@ if __name__ == '__main__':
print(f'model_dir is: {model_dir}') print(f'model_dir is: {model_dir}')
print(f'layoutreader_model_dir is: {layoutreader_model_dir}') print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json' json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
config_file_name = 'magic-pdf.json' config_file_name = 'magic-pdf.json'
home_dir = os.path.expanduser('~') home_dir = os.path.expanduser('~')
config_file = os.path.join(home_dir, config_file_name) config_file = os.path.join(home_dir, config_file_name)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment