Commit d890d8fe authored by chenych's avatar chenych
Browse files

Fix params and Update README.

parent 055b6aa1
......@@ -9,7 +9,7 @@ os.environ["HIP_VISIBLE_DEVICES"] = '0'
parse = argparse.ArgumentParser()
parse.add_argument('--model_name_or_path', type=str, default='deepseek-ai/DeepSeek-OCR')
parse.add_argument('--image_file', type=str, default='./doc/test.jpg')
parse.add_argument('--image_file', type=str, default='./doc/test.png')
parse.add_argument('--output_path', type=str, default='./output/')
args = parse.parse_args()
......
......@@ -21,8 +21,8 @@ MODEL_PATH = 'deepseek-ai/DeepSeek-OCR' # change to your model path
# .jpg, .png, .jpeg: run_dpsk_ocr_image.py;
# Omnidocbench images path: run_dpsk_ocr_eval_batch.py
INPUT_PATH = ''
OUTPUT_PATH = ''
INPUT_PATH = './doc/test.png'
OUTPUT_PATH = './output'
PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
# PROMPT = '<image>\nFree OCR.'
......
import asyncio
import re
import os
import argparse
import torch
if torch.version.cuda == '11.8':
os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
os.environ['VLLM_USE_V1'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
os.environ["HIP_VISIBLE_DEVICES"] = '0'
from vllm import AsyncLLMEngine, SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
......@@ -21,8 +17,6 @@ from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
from process.image_process import DeepseekOCRProcessor
from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, CROP_MODE
ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
def load_image(image_path):
......@@ -136,21 +130,17 @@ def draw_bounding_boxes(image, refs):
img_draw.paste(overlay, (0, 0), overlay)
return img_draw
def process_image_with_refs(image, ref_texts):
    """Render grounding references onto *image*.

    Thin convenience wrapper: delegates to draw_bounding_boxes and
    returns the annotated copy it produces.
    """
    annotated = draw_bounding_boxes(image, ref_texts)
    return annotated
async def stream_generate(image=None, prompt=''):
engine_args = AsyncEngineArgs(
model=MODEL_PATH,
hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
block_size=256,
block_size=64,
max_model_len=8192,
enforce_eager=False,
trust_remote_code=True,
......
......@@ -3,17 +3,12 @@ import fitz
import img2pdf
import io
import re
from tqdm import tqdm
import torch
from concurrent.futures import ThreadPoolExecutor
if torch.version.cuda == '11.8':
os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
os.environ['VLLM_USE_V1'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, SKIP_REPEAT, MAX_CONCURRENCY, NUM_WORKERS, CROP_MODE
from PIL import Image, ImageDraw, ImageFont
......@@ -32,7 +27,7 @@ ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
llm = LLM(
model=MODEL_PATH,
hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
block_size=256,
block_size=64,
enforce_eager=False,
trust_remote_code=True,
max_model_len=8192,
......
......@@ -16,10 +16,6 @@ DeepSeek-OCR 的能力范围包括:
- 多语言处理(中英文混合识别)
- 物体定位(grounding 功能支持)
<div align=center>
<img src="./doc/xxx.png"/>
</div>
## 环境配置
### 硬件需求
DCU型号:K100AI,节点数量:1台,卡数:1张。
......@@ -28,7 +24,7 @@ DCU型号:K100AI,节点数量:1台,卡数:1张。
### Docker(方法一)
```bash
docker pull image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2-ds3.2
docker pull image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash
......@@ -52,7 +48,7 @@ DTK: 25.04.1
python: 3.10.12
torch: 2.5.1+das.opt1.dtk25041
transformers: 4.46.3
vllm: 0.9.2
vllm: 0.8.5
```
`Tips:以上dtk驱动、pytorch等DCU相关工具版本需要严格一一对应`, 其它非深度学习库参照requirements.txt安装:
```bash
......@@ -67,25 +63,25 @@ pip install -r requirements.txt
## 推理
### transformers
模型地址,测试图片路径,输出路径根据实际情况修改。
> 模型地址,测试图片路径,输出路径根据实际情况修改。
```bash
cd DeepSeek-OCR-master/DeepSeek-OCR-hf
python run_dpsk_ocr.py --model_name_or_path=deepseek-ai/DeepSeek-OCR --image_path=./doc/test.jpg --output_path=./output
python DeepSeek-OCR-hf/run_dpsk_ocr.py --model_name_or_path=deepseek-ai/DeepSeek-OCR --image_path=./doc/test.png --output_path=./output
```
### vllm
> 模型地址,测试图片路径,输出路径请根据实际情况在`DeepSeek-OCR-vllm/config.py`中修改。
```bash
cd DeepSeek-OCR-master/DeepSeek-OCR-vllm
# image: streaming output
python run_dpsk_ocr_image.py
export VLLM_USE_V1=0
# image:流式输出
python DeepSeek-OCR-vllm/run_dpsk_ocr_image.py
# pdf
python run_dpsk_ocr_pdf.py
python DeepSeek-OCR-vllm/run_dpsk_ocr_pdf.py
```
## 结果
<div align=center>
<img src="./doc/xxx.png"/>
<img src="./doc/result_with_boxes_vllm.jpg"/>
</div>
### 精度
......
FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2-ds3.2
\ No newline at end of file
FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724
\ No newline at end of file
# 模型唯一标识
modelCode=205
modelCode=1781
# 模型名称
modelName=PaddleOCR_paddle_onnxruntime
modelName=deepseek-ocr_pytorch
# 模型描述
modelDescription=paddleocr_paddle_onnxruntime是一个实现字符检测和识别的模型
modelDescription=DeepSeek 推出了全新的视觉文本压缩模型 DeepSeek-OCR
# 应用场景
appScenario=推理,训练,OCR,制造,金融,交通,教育,医疗
appScenario=推理,OCR,制造,金融,交通,教育,医疗
# 框架类型
frameType=paddle,onnxruntime
frameType=pytorch,vllm
# 加速卡类型
accelerateType=K100AI
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment