Commit d890d8fe authored by chenych
Browse files

Fix params and Update README.

parent 055b6aa1
...@@ -9,7 +9,7 @@ os.environ["HIP_VISIBLE_DEVICES"] = '0' ...@@ -9,7 +9,7 @@ os.environ["HIP_VISIBLE_DEVICES"] = '0'
parse = argparse.ArgumentParser() parse = argparse.ArgumentParser()
parse.add_argument('--model_name_or_path', type=str, default='deepseek-ai/DeepSeek-OCR') parse.add_argument('--model_name_or_path', type=str, default='deepseek-ai/DeepSeek-OCR')
parse.add_argument('--image_file', type=str, default='./doc/test.jpg') parse.add_argument('--image_file', type=str, default='./doc/test.png')
parse.add_argument('--output_path', type=str, default='./output/') parse.add_argument('--output_path', type=str, default='./output/')
args = parse.parse_args() args = parse.parse_args()
......
...@@ -21,8 +21,8 @@ MODEL_PATH = 'deepseek-ai/DeepSeek-OCR' # change to your model path ...@@ -21,8 +21,8 @@ MODEL_PATH = 'deepseek-ai/DeepSeek-OCR' # change to your model path
# .jpg, .png, .jpeg: run_dpsk_ocr_image.py; # .jpg, .png, .jpeg: run_dpsk_ocr_image.py;
# Omnidocbench images path: run_dpsk_ocr_eval_batch.py # Omnidocbench images path: run_dpsk_ocr_eval_batch.py
INPUT_PATH = '' INPUT_PATH = './doc/test.png'
OUTPUT_PATH = '' OUTPUT_PATH = './output'
PROMPT = '<image>\n<|grounding|>Convert the document to markdown.' PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
# PROMPT = '<image>\nFree OCR.' # PROMPT = '<image>\nFree OCR.'
......
import asyncio import asyncio
import re import re
import os import os
import argparse
import torch os.environ["HIP_VISIBLE_DEVICES"] = '0'
if torch.version.cuda == '11.8':
os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
os.environ['VLLM_USE_V1'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
from vllm import AsyncLLMEngine, SamplingParams from vllm import AsyncLLMEngine, SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
...@@ -21,8 +17,6 @@ from process.ngram_norepeat import NoRepeatNGramLogitsProcessor ...@@ -21,8 +17,6 @@ from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
from process.image_process import DeepseekOCRProcessor from process.image_process import DeepseekOCRProcessor
from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, CROP_MODE from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, CROP_MODE
ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM) ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
def load_image(image_path): def load_image(image_path):
...@@ -136,21 +130,17 @@ def draw_bounding_boxes(image, refs): ...@@ -136,21 +130,17 @@ def draw_bounding_boxes(image, refs):
img_draw.paste(overlay, (0, 0), overlay) img_draw.paste(overlay, (0, 0), overlay)
return img_draw return img_draw
def process_image_with_refs(image, ref_texts): def process_image_with_refs(image, ref_texts):
result_image = draw_bounding_boxes(image, ref_texts) result_image = draw_bounding_boxes(image, ref_texts)
return result_image return result_image
async def stream_generate(image=None, prompt=''): async def stream_generate(image=None, prompt=''):
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model=MODEL_PATH, model=MODEL_PATH,
hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]}, hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
block_size=256, block_size=64,
max_model_len=8192, max_model_len=8192,
enforce_eager=False, enforce_eager=False,
trust_remote_code=True, trust_remote_code=True,
......
...@@ -3,17 +3,12 @@ import fitz ...@@ -3,17 +3,12 @@ import fitz
import img2pdf import img2pdf
import io import io
import re import re
from tqdm import tqdm from tqdm import tqdm
import torch
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
if torch.version.cuda == '11.8':
os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
os.environ['VLLM_USE_V1'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = '0' os.environ["CUDA_VISIBLE_DEVICES"] = '0'
from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, SKIP_REPEAT, MAX_CONCURRENCY, NUM_WORKERS, CROP_MODE from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, SKIP_REPEAT, MAX_CONCURRENCY, NUM_WORKERS, CROP_MODE
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
...@@ -32,7 +27,7 @@ ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM) ...@@ -32,7 +27,7 @@ ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
llm = LLM( llm = LLM(
model=MODEL_PATH, model=MODEL_PATH,
hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]}, hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
block_size=256, block_size=64,
enforce_eager=False, enforce_eager=False,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
......
...@@ -16,10 +16,6 @@ DeepSeek-OCR 的能力范围包括: ...@@ -16,10 +16,6 @@ DeepSeek-OCR 的能力范围包括:
- 多语言处理(中英文混合识别) - 多语言处理(中英文混合识别)
- 物体定位(grounding 功能支持) - 物体定位(grounding 功能支持)
<div align=center>
<img src="./doc/xxx.png"/>
</div>
## 环境配置 ## 环境配置
### 硬件需求 ### 硬件需求
DCU型号:K100AI,节点数量:1台,卡数:1张。 DCU型号:K100AI,节点数量:1台,卡数:1张。
...@@ -28,7 +24,7 @@ DCU型号:K100AI,节点数量:1台,卡数:1张。 ...@@ -28,7 +24,7 @@ DCU型号:K100AI,节点数量:1台,卡数:1张。
### Docker(方法一) ### Docker(方法一)
```bash ```bash
docker pull image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2-ds3.2 docker pull image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash
...@@ -52,7 +48,7 @@ DTK: 25.04.1 ...@@ -52,7 +48,7 @@ DTK: 25.04.1
python: 3.10.12 python: 3.10.12
torch: 2.5.1+das.opt1.dtk25041 torch: 2.5.1+das.opt1.dtk25041
transformers: 4.46.3 transformers: 4.46.3
vllm: 0.9.2 vllm: 0.8.5
``` ```
`Tips:以上dtk驱动、pytorch等DCU相关工具版本需要严格一一对应`, 其它非深度学习库参照requirements.txt安装: `Tips:以上dtk驱动、pytorch等DCU相关工具版本需要严格一一对应`, 其它非深度学习库参照requirements.txt安装:
```bash ```bash
...@@ -67,25 +63,25 @@ pip install -r requirements.txt ...@@ -67,25 +63,25 @@ pip install -r requirements.txt
## 推理 ## 推理
### transformers ### transformers
模型地址,测试图片路径,输出路径根据实际情况修改。 > 模型地址,测试图片路径,输出路径根据实际情况修改。
```bash ```bash
cd DeepSeek-OCR-master/DeepSeek-OCR-hf python DeepSeek-OCR-hf/run_dpsk_ocr.py --model_name_or_path=deepseek-ai/DeepSeek-OCR --image_file=./doc/test.png --output_path=./output
python run_dpsk_ocr.py --model_name_or_path=deepseek-ai/DeepSeek-OCR --image_file=./doc/test.jpg --output_path=./output
``` ```
### vllm ### vllm
> 模型地址,测试图片路径,输出路径请根据实际情况在`DeepSeek-OCR-vllm/config.py`中修改。
```bash ```bash
cd DeepSeek-OCR-master/DeepSeek-OCR-vllm export VLLM_USE_V1=0
# image: streaming output # image:流式输出
python run_dpsk_ocr_image.py python DeepSeek-OCR-vllm/run_dpsk_ocr_image.py
# pdf # pdf
python run_dpsk_ocr_pdf.py python DeepSeek-OCR-vllm/run_dpsk_ocr_pdf.py
``` ```
## result ## result
<div align=center> <div align=center>
<img src="./doc/xxx.png"/> <img src="./doc/result_with_boxes_vllm.jpg"/>
</div> </div>
### 精度 ### 精度
......
FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2-ds3.2 FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724
\ No newline at end of file \ No newline at end of file
# 模型唯一标识 # 模型唯一标识
modelCode=205 modelCode=1781
# 模型名称 # 模型名称
modelName=PaddleOCR_paddle_onnxruntime modelName=deepseek-ocr_pytorch
# 模型描述 # 模型描述
modelDescription=paddleocr_paddle_onnxruntime是一个实现字符检测和识别的模型 modelDescription=DeepSeek 推出了全新的视觉文本压缩模型 DeepSeek-OCR
# 应用场景 # 应用场景
appScenario=推理,训练,OCR,制造,金融,交通,教育,医疗 appScenario=推理,OCR,制造,金融,交通,教育,医疗
# 框架类型 # 框架类型
frameType=paddle,onnxruntime frameType=pytorch,vllm
# 加速卡类型 # 加速卡类型
accelerateType=K100AI accelerateType=K100AI
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment