Commit b6c39f3b authored by zhougaofeng's avatar zhougaofeng
Browse files

Merge branch 'zhiAn123-main-patch-82168' into 'main'

Update doc/image (1).png, doc/image (10).png, doc/image (2).png, doc/image...

See merge request !1
parents 234deee1 9d156b18
...@@ -50,7 +50,19 @@ ...@@ -50,7 +50,19 @@
<img src="doc/image (10).png"/> <img src="doc/image (10).png"/>
</div> </div>
### 5、启动qwen-ocr模块: ### 5、配置config.ini中的路由地址
vim magic_pdf/config.ini
默认如下:
`pdf_server = http://0.0.0.0:4090`
`ocr_server = http://0.0.0.0:4080`
根据需要,自行配置路由地址
### 6、启动qwen-ocr模块:
修改magic_pdf/magic_pdf/dict2md/ocr_server.py文件中模型路径地址 修改magic_pdf/magic_pdf/dict2md/ocr_server.py文件中模型路径地址
...@@ -62,20 +74,20 @@ ...@@ -62,20 +74,20 @@
`python magic_pdf/dict2md/ocr_server.py` `python magic_pdf/dict2md/ocr_server.py`
默认使用6020端口,0号DCU卡 ,可以通过--dcu_id 指定卡,--server_port指定端口号,-c 指定qwen模型地址 默认使用0号DCU卡 ,可以通过--dcu_id 指定卡,-c 指定qwen模型地址,--config_path 指定config.ini路径
qwen-ocr模块启动成功: qwen-ocr模块启动成功:
<div align=center> <div align=center>
<img src="doc/image (5).png"/> <img src="doc/image (5).png"/>
</div> </div>
### 5、启动pdf-server解析服务: ### 7、启动pdf-server解析服务:
#### pdf-server解析服务启动代码: #### pdf-server解析服务启动代码:
`python magic_pdf/tools/pdf_server.py` `python magic_pdf/tools/pdf_server.py`
默认使用6030端口,0号DCU卡 ,可以通过--dcu_id 指定卡,--pdf_port指定端口号 默认使用0号DCU卡 ,可以通过--dcu_id 指定卡,--config_path 指定config.ini路径
<div align=center> <div align=center>
<img src="doc/image (6).png"/> <img src="doc/image (6).png"/>
...@@ -86,9 +98,18 @@ qwen-ocr模块启动成功: ...@@ -86,9 +98,18 @@ qwen-ocr模块启动成功:
<img src="doc/image (7).png"/> <img src="doc/image (7).png"/>
</div> </div>
### 6、解析pdf ### 8、解析pdf
`python magic_pdf/parse/common_parse.py -p [文件/目录 路径] -o [输出地址]` `python magic_pdf/parse/common_parse.py -p [文件/目录 路径] -o [输出地址]`
-p 指定pdf路径,-o 指定输出路径,--config_path 指定config.ini路径
<div align=center>
<img src="doc/image12.png"/>
</div>
<div align=center> <div align=center>
<img src="doc/image (8).png"/> <img src="doc/image (8).png"/>
</div> </div>
-p指定pdf路径,-o指定输出路径
doc/image (6).png

38.1 KB | W: | H:

doc/image (6).png

42.6 KB | W: | H:

doc/image (6).png
doc/image (6).png
doc/image (6).png
doc/image (6).png
  • 2-up
  • Swipe
  • Onion skin
doc/image11.png

87.4 KB | W: | H:

doc/image11.png

112 KB | W: | H:

doc/image11.png
doc/image11.png
doc/image11.png
doc/image11.png
  • 2-up
  • Swipe
  • Onion skin
[server]
pdf_server = http://0.0.0.0:4090
ocr_server = http://0.0.0.0:4080
import configparser
import re import re
import time import time
...@@ -120,11 +121,14 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''): ...@@ -120,11 +121,14 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
return page_markdown return page_markdown
def ocr_mk_markdown_with_para_core_v2(paras_of_layout, def ocr_mk_markdown_with_para_core_v2(config_path,paras_of_layout,
mode, mode,
img_buket_path=''): img_buket_path=''):
page_markdown = [] page_markdown = []
url = 'http://127.0.0.1:6020' config = configparser.ConfigParser()
config.read(config_path)
url = config.get('server', 'ocr_server')
logger.info(f'ocr_server:{url}')
client = PredictClient(url) client = PredictClient(url)
for para_block in paras_of_layout: for para_block in paras_of_layout:
para_text = '' para_text = ''
...@@ -415,7 +419,8 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list): ...@@ -415,7 +419,8 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
return content_list return content_list
def union_make(pdf_info_dict: list, def union_make(config_path: str,
pdf_info_dict: list,
make_mode: str, make_mode: str,
drop_mode: str, drop_mode: str,
img_buket_path: str = ''): img_buket_path: str = ''):
...@@ -442,11 +447,11 @@ def union_make(pdf_info_dict: list, ...@@ -442,11 +447,11 @@ def union_make(pdf_info_dict: list,
continue continue
if make_mode == MakeMode.MM_MD: if make_mode == MakeMode.MM_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2( page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'mm', img_buket_path) config_path,paras_of_layout, 'mm', img_buket_path)
output_content.extend(page_markdown) output_content.extend(page_markdown)
elif make_mode == MakeMode.NLP_MD: elif make_mode == MakeMode.NLP_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2( page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'nlp') config_path,paras_of_layout, 'nlp')
output_content.extend(page_markdown) output_content.extend(page_markdown)
elif make_mode == MakeMode.STANDARD_FORMAT: elif make_mode == MakeMode.STANDARD_FORMAT:
for para_block in paras_of_layout: for para_block in paras_of_layout:
......
...@@ -2,10 +2,12 @@ ...@@ -2,10 +2,12 @@
# #
# This source code is licensed under the license found in the # This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree # LICENSE file in the root directory of this source tree
import configparser
import copy import copy
import re import re
import gc import gc
import time
import torch import torch
from argparse import ArgumentParser from argparse import ArgumentParser
from threading import Thread from threading import Thread
...@@ -15,6 +17,7 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, TextIte ...@@ -15,6 +17,7 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, TextIte
from fastapi import FastAPI from fastapi import FastAPI
from pydantic import BaseModel from pydantic import BaseModel
from typing import Optional from typing import Optional
from loguru import logger
app = FastAPI() app = FastAPI()
...@@ -36,10 +39,11 @@ def _get_args(): ...@@ -36,10 +39,11 @@ def _get_args():
help='Create a publicly shareable link for the interface.') help='Create a publicly shareable link for the interface.')
parser.add_argument('--inbrowser', action='store_true', default=False, parser.add_argument('--inbrowser', action='store_true', default=False,
help='Automatically launch the interface in a new tab on the default browser.') help='Automatically launch the interface in a new tab on the default browser.')
parser.add_argument('--server_port', type=int, default=6020, help='Demo server port.')
parser.add_argument('--server_name', type=str, default='127.0.0.1', help='Demo server name.')
parser.add_argument('--dcu_id', type=str, default='0', help='Specify the GPU ID to load the model onto.') parser.add_argument('--dcu_id', type=str, default='0', help='Specify the GPU ID to load the model onto.')
parser.add_argument(
'--config_path',
default='/home/practice/magic_pdf-main/magic_pdf/config.ini',
)
args = parser.parse_args() args = parser.parse_args()
return args return args
...@@ -232,13 +236,14 @@ async def predict(item: Item): ...@@ -232,13 +236,14 @@ async def predict(item: Item):
] ]
} }
] ]
start = time.time()
generated_text = '' generated_text = ''
for response in call_local_model(model, processor, messages): for response in call_local_model(model, processor, messages):
generated_text = _parse_text(response) generated_text = _parse_text(response)
_gc() _gc()
end = time.time()
logger.info(f'【{item.image_path}】解析的结果是:{generated_text},耗时为:{end-start}')
return {"Generated Text": generated_text} return {"Generated Text": generated_text}
...@@ -246,6 +251,12 @@ if __name__ == "__main__": ...@@ -246,6 +251,12 @@ if __name__ == "__main__":
import uvicorn import uvicorn
args = _get_args() args = _get_args()
uvicorn.run(app, host=args.server_name, port=args.server_port) config = configparser.ConfigParser()
config.read(args.config_path)
ocr_server = config.get('server', 'ocr_server')
if 'http' in ocr_server:
ocr_server = ocr_server.split('://')[1]
host,port = ocr_server.split(':')[0],int(ocr_server.split(':')[1])
uvicorn.run(app, host=host, port=port)
...@@ -56,9 +56,9 @@ class AbsPipe(ABC): ...@@ -56,9 +56,9 @@ class AbsPipe(ABC):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode) content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return content_list return content_list
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD): def pipe_mk_markdown(self,config_path,img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
# logger.info(f'self.get_compress_pdf_mid_data():\n{self.get_compress_pdf_mid_data()}') # logger.info(f'self.get_compress_pdf_mid_data():\n{self.get_compress_pdf_mid_data()}')
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode) md_content = AbsPipe.mk_markdown(config_path,self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode)
return md_content return md_content
@staticmethod @staticmethod
...@@ -101,16 +101,13 @@ class AbsPipe(ABC): ...@@ -101,16 +101,13 @@ class AbsPipe(ABC):
return content_list return content_list
@staticmethod @staticmethod
def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list: def mk_markdown(config_path: str,compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
""" """
根据pdf类型,markdown 根据pdf类型,markdown
""" """
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data) pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
pdf_info_list = pdf_mid_data["pdf_info"] pdf_info_list = pdf_mid_data["pdf_info"]
logger.info(f'pdf_mid_data:\n{pdf_mid_data}') md_content = union_make(config_path,pdf_info_list, md_make_mode, drop_mode, img_buket_path)
logger.info('-'*80)
logger.info(f'pdf_info_list:\n{pdf_info_list}')
md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
return md_content return md_content
......
...@@ -52,11 +52,11 @@ class UNIPipe(AbsPipe): ...@@ -52,11 +52,11 @@ class UNIPipe(AbsPipe):
logger.info("uni_pipe mk content list finished") logger.info("uni_pipe mk content list finished")
return result return result
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD): def pipe_mk_markdown(self,config_path, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
# logger.info(f'img_parent_path:\n{img_parent_path}') # logger.info(f'img_parent_path:\n{img_parent_path}')
# logger.info(f'drop_mode:\n{drop_mode}') # logger.info(f'drop_mode:\n{drop_mode}')
# logger.info(f'md_make_mode:\n{md_make_mode}') # logger.info(f'md_make_mode:\n{md_make_mode}')
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode) result = super().pipe_mk_markdown(config_path,img_parent_path, drop_mode, md_make_mode)
logger.info(f"uni_pipe mk {md_make_mode} finished") logger.info(f"uni_pipe mk {md_make_mode} finished")
return result return result
......
...@@ -40,12 +40,14 @@ def remove_empty_lines_from_file(file_path): ...@@ -40,12 +40,14 @@ def remove_empty_lines_from_file(file_path):
file.writelines(non_empty_lines) file.writelines(non_empty_lines)
def do_parse( def do_parse(
config_path,
output_dir, output_dir,
pdf_file_name, pdf_file_name,
pdf_bytes, pdf_bytes,
model_list, model_list,
parse_method, parse_method,
debug_able, debug_able,
f_draw_span_bbox=True, f_draw_span_bbox=True,
f_draw_layout_bbox=True, f_draw_layout_bbox=True,
f_dump_md=True, f_dump_md=True,
...@@ -57,6 +59,7 @@ def do_parse( ...@@ -57,6 +59,7 @@ def do_parse(
f_draw_model_bbox=False, f_draw_model_bbox=False,
start_page_id=0, start_page_id=0,
end_page_id=None, end_page_id=None,
): ):
if debug_able: if debug_able:
logger.warning('debug mode is on') logger.warning('debug mode is on')
...@@ -72,7 +75,7 @@ def do_parse( ...@@ -72,7 +75,7 @@ def do_parse(
image_dir = str(os.path.basename(local_image_dir)) image_dir = str(os.path.basename(local_image_dir))
# logger.info(f'model_list:{model_list}') # logger.info(f'model_list:{model_list}')
# logger.info(f'local_image_dir:::{local_image_dir}') # logger.info(f'local_image_dir:::{local_image_dir}')
logger.info(f'image_dir:::{image_dir}') # logger.info(f'image_dir:::{image_dir}')
if parse_method == 'auto': if parse_method == 'auto':
jso_useful_key = {'_pdf_type': '', 'model_list': model_list} jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True, pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
...@@ -108,7 +111,7 @@ def do_parse( ...@@ -108,7 +111,7 @@ def do_parse(
# if f_draw_model_bbox: # if f_draw_model_bbox:
# drow_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name) # drow_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
md_content = pipe.pipe_mk_markdown(local_image_dir, md_content = pipe.pipe_mk_markdown(config_path,local_image_dir,
drop_mode=DropMode.NONE, drop_mode=DropMode.NONE,
md_make_mode=f_make_md_mode) md_make_mode=f_make_md_mode)
......
# -*- coding: utf-8 -*-
import time
import requests
from loguru import logger
import argparse
import os
class ocrPdfClient:
    """Minimal HTTP client for the pdf-server ``/pdf_ocr`` endpoint."""

    def __init__(self, api_url):
        """Store the base URL of the pdf-server.

        Args:
            api_url: Base URL such as ``http://host:port``; ``/pdf_ocr`` is
                appended to it for every request.
        """
        self.api_url = api_url

    @staticmethod
    def _build_payload(path, output_dir, config_path=None):
        """Build the JSON body for ``/pdf_ocr``; ``config_path`` is added only when given."""
        payload = {
            "path": str(path),
            "output_dir": str(output_dir),
        }
        if config_path is not None:
            payload["config_path"] = str(config_path)
        return payload

    def ocr_pdf_client(self, path, output_dir, config_path=None):
        """Ask the server to parse ``path`` and write the results to ``output_dir``.

        Args:
            path: PDF file (or directory) to parse; sent as a string.
            output_dir: Directory the server should write to; sent as a string.
            config_path: Optional path to ``config.ini``. A server whose request
                model requires a ``config_path`` field rejects requests that
                omit it, so pass it when talking to such a server. Omitting it
                keeps the request body identical to the previous behaviour.

        Returns:
            ``output_dir`` unchanged when the server answers with HTTP 200.

        Raises:
            Exception: If the server answers with any status other than 200.
            requests.exceptions.RequestException: Propagated from
                ``requests.post`` on connection-level failures.
        """
        payload = self._build_payload(path, output_dir, config_path)
        logger.info(f'pdf路径:{path},输出路径{output_dir}')
        response = requests.post(f"{self.api_url}/pdf_ocr", json=payload)
        logger.info(f'response:{response}')
        if response.status_code != 200:
            raise Exception(f"ocrPdf API request failed with status code {response.status_code}")
        return output_dir
def parse_args():
    """Parse the client's command-line options.

    Returns:
        argparse.Namespace with ``url`` (defaults to ``http://0.0.0.0:6030``),
        ``path`` (``--path``/``-p``, required) and ``output_dir``
        (``--output_dir``/``-o``, required).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--url', default='http://0.0.0.0:6030')
    # Both mandatory options share the same shape: a long flag plus a short alias.
    for long_flag, short_flag in (('--path', '-p'), ('--output_dir', '-o')):
        cli.add_argument(long_flag, short_flag, required=True)
    return cli.parse_args()
def main():
    """Command-line entry point: send one OCR request and log how long it took.

    Reads the options via ``parse_args()``, anchors a relative output
    directory at the current working directory, calls
    ``ocrPdfClient.ocr_pdf_client`` and logs the outcome. Failures are logged
    rather than raised, so the elapsed-time line is always written.
    """
    args = parse_args()
    client = ocrPdfClient(args.url)
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments while the request is in flight.
    doc_analyze_start = time.perf_counter()
    if os.path.isabs(args.output_dir):
        output_dir = args.output_dir
    else:
        # Relative output directories are resolved against the caller's cwd
        # before being sent to the server.
        output_dir = os.path.join(os.getcwd(), args.output_dir)
    logger.info(f'output_dir:{output_dir}')
    try:
        res = client.ocr_pdf_client(path=args.path, output_dir=output_dir)
        if res:
            logger.info(f"output_dir: '{res}'")
        else:
            logger.warning("None")
    except requests.exceptions.RequestException as e:
        # The message used to name a "reranker service"; this client only
        # talks to the pdf_ocr endpoint.
        logger.error(f"Error while making request to pdf_ocr service: {e}")
    except Exception as e:
        logger.error(f"Unexpected error occurred: {e}")
    doc_analyze_cost = time.perf_counter() - doc_analyze_start
    logger.info(f'解析当前pdf{args.path}耗时为:{doc_analyze_cost}')
# Run the client only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
...@@ -14,13 +14,15 @@ from argparse import ArgumentParser ...@@ -14,13 +14,15 @@ from argparse import ArgumentParser
from pydantic import BaseModel from pydantic import BaseModel
import uvicorn import uvicorn
import time import time
import configparser
#from magic_pdf.tools.config import update_config
app = FastAPI() app = FastAPI()
method = 'auto' method = 'auto'
class ocrRequest(BaseModel): class ocrRequest(BaseModel):
path: str path: str
output_dir: str output_dir: str
config_path: str
def parse_args(): def parse_args():
parser = ArgumentParser() parser = ArgumentParser()
...@@ -28,10 +30,6 @@ def parse_args(): ...@@ -28,10 +30,6 @@ def parse_args():
'--dcu_id', '--dcu_id',
default='0', default='0',
help='设置DCU') help='设置DCU')
parser.add_argument(
'--pdf_port',
default=6030,
help='设置DCU')
parser.add_argument( parser.add_argument(
'--method', '--method',
type=parse_pdf_methods, type=parse_pdf_methods,
...@@ -42,41 +40,37 @@ def parse_args(): ...@@ -42,41 +40,37 @@ def parse_args():
without method specified, auto will be used by default.""", without method specified, auto will be used by default.""",
default = 'auto', default = 'auto',
) )
# parser.add_argument(
# '--start',
# type=int,
# help='The starting page for PDF parsing, beginning from 0.',
# default=0,
# )
# parser.add_argument(
# '--end',
# type=int,
# help='The ending page for PDF parsing, beginning from 0.',
# default=None,
# )
parser.add_argument( parser.add_argument(
'--debug', '--debug',
type=bool, type=bool,
help='Enables detailed debugging information during the execution of the CLI commands.', help='Enables detailed debugging information during the execution of the CLI commands.',
default=False, default=False,
) )
parser.add_argument(
'--config_path',
default='/home/practice/magic_pdf-main/magic_pdf/config.ini')
args = parser.parse_args() args = parser.parse_args()
return args return args
def ocr_pdf_serve(args: str): def ocr_pdf_serve(args: str):
os.environ["CUDA_VISIBLE_DEVICES"] = args.dcu_id os.environ["CUDA_VISIBLE_DEVICES"] = args.dcu_id
uvicorn.run(app, host="0.0.0.0", port=args.pdf_port) config = configparser.ConfigParser()
config.read(args.config_path)
pdf_server = config.get('server', 'pdf_server')
if 'http' in pdf_server:
pdf_server = pdf_server.split('://')[1]
host,port = pdf_server.split(':')[0],int(pdf_server.split(':')[1])
uvicorn.run(app, host=host, port=port)
@app.post("/pdf_ocr") @app.post("/pdf_ocr")
# def cli(path, output_dir, method, debug_able, start_page_id, end_page_id): # def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
def pdf_ocr(request: ocrRequest): async def pdf_ocr(request: ocrRequest):
model_config.__use_inside_model__ = True model_config.__use_inside_model__ = True
model_config.__model_mode__ = 'full' model_config.__model_mode__ = 'full'
output_dir = request.output_dir output_dir = request.output_dir
path = request.path path = request.path
#config_path = request.config_path
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
debug_able = False debug_able = False
start_page_id = 0 start_page_id = 0
...@@ -86,11 +80,12 @@ def pdf_ocr(request: ocrRequest): ...@@ -86,11 +80,12 @@ def pdf_ocr(request: ocrRequest):
disk_rw = DiskReaderWriter(os.path.dirname(path)) disk_rw = DiskReaderWriter(os.path.dirname(path))
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN) return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
def parse_doc(doc_path: str): def parse_doc(doc_path: str, config_path: str):
try: try:
file_name = str(Path(doc_path).stem) file_name = str(Path(doc_path).stem)
pdf_data = read_fn(doc_path) pdf_data = read_fn(doc_path)
do_parse( do_parse(
config_path,
output_dir, output_dir,
file_name, file_name,
pdf_data, pdf_data,
...@@ -104,21 +99,8 @@ def pdf_ocr(request: ocrRequest): ...@@ -104,21 +99,8 @@ def pdf_ocr(request: ocrRequest):
except Exception as e: except Exception as e:
logger.exception(e) logger.exception(e)
if os.path.isdir(path): logger.info(f'config_path:{request.config_path}')
for root, dirs, files in os.walk(path): parse_doc(path,request.config_path)
# 查找所有的pdf文件
for file in files:
if file.endswith('.pdf'):
# 打印pdf文件的完整路径
doc_path = os.path.join(root, file)
start = time.time()
logger.info(f'正在解析:{doc_path}')
parse_doc(doc_path)
end = time.time()
logger.info(f'解析:{doc_path}的耗时为:{end -start}')
else:
logger.info(f'正在解析:{path}')
parse_doc(path)
def main(): def main():
args = parse_args() args = parse_args()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment