"src/lib/vscode:/vscode.git/clone" did not exist on "1ad5456fe0511bd34d8dc0e214aa338954014d88"
Commit b6c39f3b authored by zhougaofeng's avatar zhougaofeng
Browse files

Merge branch 'zhiAn123-main-patch-82168' into 'main'

Update doc/image (1).png, doc/image (10).png, doc/image (2).png, doc/image...

See merge request !1
parents 234deee1 9d156b18
......@@ -50,7 +50,19 @@
<img src="doc/image (10).png"/>
</div>
### 5、启动qwen-ocr模块:
### 5、配置config.ini中的路由地址
vim magic_pdf/config.ini
默认如下:
`pdf_server = http://0.0.0.0:4090`
`ocr_server = http://0.0.0.0:4080`
根据需要,自行配置路由地址
### 6、启动qwen-ocr模块:
修改magic_pdf/magic_pdf/dict2md/ocr_server.py文件中模型路径地址
......@@ -62,20 +74,20 @@
`python magic_pdf/dict2md/ocr_server.py`
默认使用6020端口,0号DCU卡 ,可以通过--dcu_id 指定卡,--server_port指定端口号,-c 指定qwen模型地址
默认使用0号DCU卡,可以通过 --dcu_id 指定卡,-c 指定qwen模型地址,--config_path 指定config.ini路径
qwen-ocr模块启动成功:
<div align=center>
<img src="doc/image (5).png"/>
</div>
### 5、启动pdf-server解析服务:
### 7、启动pdf-server解析服务:
#### pdf-server解析服务启动代码:
`python magic_pdf/tools/pdf_server.py`
默认使用6030端口,0号DCU卡 ,可以通过--dcu_id 指定卡,--pdf_port指定端口号
默认使用0号DCU卡,可以通过 --dcu_id 指定卡,--config_path 指定config.ini路径
<div align=center>
<img src="doc/image (6).png"/>
......@@ -86,9 +98,18 @@ qwen-ocr模块启动成功:
<img src="doc/image (7).png"/>
</div>
### 6、解析pdf
### 8、解析pdf
`python magic_pdf/parse/common_parse.py -p [文件/目录 路径] -o [输出地址]`
-p 指定pdf路径,-o 指定输出路径,--config_path 指定config.ini路径
<div align=center>
<img src="doc/image12.png"/>
</div>
<div align=center>
<img src="doc/image (8).png"/>
</div>
-p指定pdf路径,-o指定输出路径
doc/image (6).png

38.1 KB | W: | H:

doc/image (6).png

42.6 KB | W: | H:

doc/image (6).png
doc/image (6).png
doc/image (6).png
doc/image (6).png
  • 2-up
  • Swipe
  • Onion skin
doc/image11.png

87.4 KB | W: | H:

doc/image11.png

112 KB | W: | H:

doc/image11.png
doc/image11.png
doc/image11.png
doc/image11.png
  • 2-up
  • Swipe
  • Onion skin
[server]
pdf_server = http://0.0.0.0:4090
ocr_server = http://0.0.0.0:4080
import configparser
import re
import time
......@@ -120,11 +121,14 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
return page_markdown
def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
def ocr_mk_markdown_with_para_core_v2(config_path,paras_of_layout,
mode,
img_buket_path=''):
page_markdown = []
url = 'http://127.0.0.1:6020'
config = configparser.ConfigParser()
config.read(config_path)
url = config.get('server', 'ocr_server')
logger.info(f'ocr_server:{url}')
client = PredictClient(url)
for para_block in paras_of_layout:
para_text = ''
......@@ -415,7 +419,8 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
return content_list
def union_make(pdf_info_dict: list,
def union_make(config_path: str,
pdf_info_dict: list,
make_mode: str,
drop_mode: str,
img_buket_path: str = ''):
......@@ -442,11 +447,11 @@ def union_make(pdf_info_dict: list,
continue
if make_mode == MakeMode.MM_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'mm', img_buket_path)
config_path,paras_of_layout, 'mm', img_buket_path)
output_content.extend(page_markdown)
elif make_mode == MakeMode.NLP_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'nlp')
config_path,paras_of_layout, 'nlp')
output_content.extend(page_markdown)
elif make_mode == MakeMode.STANDARD_FORMAT:
for para_block in paras_of_layout:
......
......@@ -2,10 +2,12 @@
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree
import configparser
import copy
import re
import gc
import time
import torch
from argparse import ArgumentParser
from threading import Thread
......@@ -15,6 +17,7 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, TextIte
from fastapi import FastAPI
from pydantic import BaseModel
from typing import Optional
from loguru import logger
app = FastAPI()
......@@ -36,10 +39,11 @@ def _get_args():
help='Create a publicly shareable link for the interface.')
parser.add_argument('--inbrowser', action='store_true', default=False,
help='Automatically launch the interface in a new tab on the default browser.')
parser.add_argument('--server_port', type=int, default=6020, help='Demo server port.')
parser.add_argument('--server_name', type=str, default='127.0.0.1', help='Demo server name.')
parser.add_argument('--dcu_id', type=str, default='0', help='Specify the GPU ID to load the model onto.')
parser.add_argument(
'--config_path',
default='/home/practice/magic_pdf-main/magic_pdf/config.ini',
)
args = parser.parse_args()
return args
......@@ -232,13 +236,14 @@ async def predict(item: Item):
]
}
]
start = time.time()
generated_text = ''
for response in call_local_model(model, processor, messages):
generated_text = _parse_text(response)
_gc()
end = time.time()
logger.info(f'【{item.image_path}】解析的结果是:{generated_text},耗时为:{end-start}')
return {"Generated Text": generated_text}
......@@ -246,6 +251,12 @@ if __name__ == "__main__":
import uvicorn
args = _get_args()
uvicorn.run(app, host=args.server_name, port=args.server_port)
config = configparser.ConfigParser()
config.read(args.config_path)
ocr_server = config.get('server', 'ocr_server')
if 'http' in ocr_server:
ocr_server = ocr_server.split('://')[1]
host,port = ocr_server.split(':')[0],int(ocr_server.split(':')[1])
uvicorn.run(app, host=host, port=port)
......@@ -56,9 +56,9 @@ class AbsPipe(ABC):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return content_list
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
def pipe_mk_markdown(self,config_path,img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
# logger.info(f'self.get_compress_pdf_mid_data():\n{self.get_compress_pdf_mid_data()}')
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode)
md_content = AbsPipe.mk_markdown(config_path,self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode)
return md_content
@staticmethod
......@@ -101,16 +101,13 @@ class AbsPipe(ABC):
return content_list
@staticmethod
def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
def mk_markdown(config_path: str,compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
"""
根据pdf类型,markdown
"""
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
pdf_info_list = pdf_mid_data["pdf_info"]
logger.info(f'pdf_mid_data:\n{pdf_mid_data}')
logger.info('-'*80)
logger.info(f'pdf_info_list:\n{pdf_info_list}')
md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
md_content = union_make(config_path,pdf_info_list, md_make_mode, drop_mode, img_buket_path)
return md_content
......
......@@ -52,11 +52,11 @@ class UNIPipe(AbsPipe):
logger.info("uni_pipe mk content list finished")
return result
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
def pipe_mk_markdown(self,config_path, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
# logger.info(f'img_parent_path:\n{img_parent_path}')
# logger.info(f'drop_mode:\n{drop_mode}')
# logger.info(f'md_make_mode:\n{md_make_mode}')
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
result = super().pipe_mk_markdown(config_path,img_parent_path, drop_mode, md_make_mode)
logger.info(f"uni_pipe mk {md_make_mode} finished")
return result
......
......@@ -40,12 +40,14 @@ def remove_empty_lines_from_file(file_path):
file.writelines(non_empty_lines)
def do_parse(
config_path,
output_dir,
pdf_file_name,
pdf_bytes,
model_list,
parse_method,
debug_able,
f_draw_span_bbox=True,
f_draw_layout_bbox=True,
f_dump_md=True,
......@@ -57,6 +59,7 @@ def do_parse(
f_draw_model_bbox=False,
start_page_id=0,
end_page_id=None,
):
if debug_able:
logger.warning('debug mode is on')
......@@ -72,7 +75,7 @@ def do_parse(
image_dir = str(os.path.basename(local_image_dir))
# logger.info(f'model_list:{model_list}')
# logger.info(f'local_image_dir:::{local_image_dir}')
logger.info(f'image_dir:::{image_dir}')
# logger.info(f'image_dir:::{image_dir}')
if parse_method == 'auto':
jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
......@@ -108,7 +111,7 @@ def do_parse(
# if f_draw_model_bbox:
# drow_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
md_content = pipe.pipe_mk_markdown(local_image_dir,
md_content = pipe.pipe_mk_markdown(config_path,local_image_dir,
drop_mode=DropMode.NONE,
md_make_mode=f_make_md_mode)
......
# -*- coding: utf-8 -*-
import time
import requests
from loguru import logger
import argparse
import os
class ocrPdfClient:
    """Minimal HTTP client for the ``/pdf_ocr`` endpoint of the PDF OCR service.

    Args:
        api_url: Base URL of the service (scheme, host and port). A trailing
            slash is tolerated and does not produce a doubled ``//`` in the
            request URL.
        timeout: Optional timeout (seconds) forwarded to ``requests.post``.
            The default ``None`` keeps the previous behaviour of waiting
            without a limit.
    """

    def __init__(self, api_url, timeout=None):
        self.api_url = api_url
        self.timeout = timeout

    def _endpoint_url(self):
        """Return the full URL of the ``/pdf_ocr`` endpoint."""
        # Strip trailing slashes so "http://host:port/" and "http://host:port"
        # both resolve to ".../pdf_ocr" instead of "...//pdf_ocr".
        base = str(self.api_url).rstrip('/')
        return f"{base}/pdf_ocr"

    def ocr_pdf_client(self, path, output_dir):
        """Ask the service to parse ``path`` and write results to ``output_dir``.

        Args:
            path: Input path sent to the service (stringified before sending).
            output_dir: Output directory sent to the service (stringified
                before sending).

        Returns:
            ``output_dir`` unchanged when the service answers with HTTP 200.

        Raises:
            Exception: If the service answers with any status other than 200.
            requests.exceptions.RequestException: Propagated from
                ``requests.post`` on connection problems or timeouts.
        """
        payload = {
            "path": str(path),
            "output_dir": str(output_dir),
        }
        logger.info(f'pdf路径:{path},输出路径{output_dir}')
        response = requests.post(
            self._endpoint_url(),
            json=payload,
            timeout=self.timeout,
        )
        logger.info(f'response:{response}')
        if response.status_code != 200:
            raise Exception(f"ocrPdf API request failed with status code {response.status_code}")
        return output_dir
def parse_args():
    """Parse the command-line options of the OCR client script.

    Options:
        --url: Base URL of the OCR service; defaults to
            ``http://0.0.0.0:6030``.
        --path / -p: Required. Input path forwarded to the service.
        --output_dir / -o: Required. Directory forwarded to the service as
            the output location.

    Returns:
        The ``argparse.Namespace`` with ``url``, ``path`` and ``output_dir``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--url', default='http://0.0.0.0:6030')
    cli.add_argument('--path', '-p', required=True)
    cli.add_argument('--output_dir', '-o', required=True)
    return cli.parse_args()
def main():
    """Command-line entry point: submit one parse request and log the outcome.

    Reads the options via ``parse_args()``, resolves a relative
    ``--output_dir`` against the current working directory, sends the request
    through ``ocrPdfClient`` and logs the result together with the elapsed
    wall-clock time. Request failures are logged rather than re-raised.
    """
    args = parse_args()
    client = ocrPdfClient(args.url)
    doc_analyze_start = time.time()

    # Send an absolute output path: a relative one is anchored at the
    # directory the client was started from.
    if os.path.isabs(args.output_dir):
        output_dir = args.output_dir
    else:
        output_dir = os.path.join(os.getcwd(), args.output_dir)
    logger.info(f'output_dir:{output_dir}')

    try:
        res = client.ocr_pdf_client(path=args.path, output_dir=output_dir)
        if res:
            logger.info(f"output_dir: '{res}'")
        else:
            logger.warning("None")
    except requests.exceptions.RequestException as e:
        # Transport-level failure (connection refused, timeout, ...).
        # The message previously named a "reranker service", which this
        # client never talks to.
        logger.error(f"Error while making request to pdf OCR service: {e}")
    except Exception as e:
        # Includes the non-200 response raised by ocr_pdf_client.
        logger.error(f"Unexpected error occurred: {e}")

    doc_analyze_cost = time.time() - doc_analyze_start
    logger.info(f'解析当前pdf{args.path}耗时为:{doc_analyze_cost}')


if __name__ == "__main__":
    main()
......@@ -14,13 +14,15 @@ from argparse import ArgumentParser
from pydantic import BaseModel
import uvicorn
import time
import configparser
#from magic_pdf.tools.config import update_config
app = FastAPI()
method = 'auto'
class ocrRequest(BaseModel):
    # Request body accepted by the POST /pdf_ocr handler.
    # Path of the input to parse; the handler reads it from disk.
    path: str
    # Directory for the results; the handler creates it if it does not exist.
    output_dir: str
    # Path of the config.ini file; the handler forwards it to the parse step.
    config_path: str
def parse_args():
parser = ArgumentParser()
......@@ -28,10 +30,6 @@ def parse_args():
'--dcu_id',
default='0',
help='设置DCU')
parser.add_argument(
'--pdf_port',
default=6030,
help='设置DCU')
parser.add_argument(
'--method',
type=parse_pdf_methods,
......@@ -42,41 +40,37 @@ def parse_args():
without method specified, auto will be used by default.""",
default = 'auto',
)
# parser.add_argument(
# '--start',
# type=int,
# help='The starting page for PDF parsing, beginning from 0.',
# default=0,
# )
# parser.add_argument(
# '--end',
# type=int,
# help='The ending page for PDF parsing, beginning from 0.',
# default=None,
# )
parser.add_argument(
'--debug',
type=bool,
help='Enables detailed debugging information during the execution of the CLI commands.',
default=False,
)
parser.add_argument(
'--config_path',
default='/home/practice/magic_pdf-main/magic_pdf/config.ini')
args = parser.parse_args()
return args
def ocr_pdf_serve(args: str):
os.environ["CUDA_VISIBLE_DEVICES"] = args.dcu_id
uvicorn.run(app, host="0.0.0.0", port=args.pdf_port)
config = configparser.ConfigParser()
config.read(args.config_path)
pdf_server = config.get('server', 'pdf_server')
if 'http' in pdf_server:
pdf_server = pdf_server.split('://')[1]
host,port = pdf_server.split(':')[0],int(pdf_server.split(':')[1])
uvicorn.run(app, host=host, port=port)
@app.post("/pdf_ocr")
# def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
def pdf_ocr(request: ocrRequest):
async def pdf_ocr(request: ocrRequest):
model_config.__use_inside_model__ = True
model_config.__model_mode__ = 'full'
output_dir = request.output_dir
path = request.path
#config_path = request.config_path
os.makedirs(output_dir, exist_ok=True)
debug_able = False
start_page_id = 0
......@@ -86,11 +80,12 @@ def pdf_ocr(request: ocrRequest):
disk_rw = DiskReaderWriter(os.path.dirname(path))
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
def parse_doc(doc_path: str):
def parse_doc(doc_path: str, config_path: str):
try:
file_name = str(Path(doc_path).stem)
pdf_data = read_fn(doc_path)
do_parse(
config_path,
output_dir,
file_name,
pdf_data,
......@@ -104,21 +99,8 @@ def pdf_ocr(request: ocrRequest):
except Exception as e:
logger.exception(e)
if os.path.isdir(path):
for root, dirs, files in os.walk(path):
# 查找所有的pdf文件
for file in files:
if file.endswith('.pdf'):
# 打印pdf文件的完整路径
doc_path = os.path.join(root, file)
start = time.time()
logger.info(f'正在解析:{doc_path}')
parse_doc(doc_path)
end = time.time()
logger.info(f'解析:{doc_path}的耗时为:{end -start}')
else:
logger.info(f'正在解析:{path}')
parse_doc(path)
logger.info(f'config_path:{request.config_path}')
parse_doc(path,request.config_path)
def main():
args = parse_args()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment