Commit d890d8fe authored by chenych

Fix params and Update README.

parent 055b6aa1
@@ -9,7 +9,7 @@ os.environ["HIP_VISIBLE_DEVICES"] = '0'
parse = argparse.ArgumentParser()
parse.add_argument('--model_name_or_path', type=str, default='deepseek-ai/DeepSeek-OCR')
-parse.add_argument('--image_file', type=str, default='./doc/test.jpg')
+parse.add_argument('--image_file', type=str, default='./doc/test.png')
parse.add_argument('--output_path', type=str, default='./output/')
args = parse.parse_args()
......
@@ -11,18 +11,18 @@ CROP_MODE = True
MIN_CROPS= 2
MAX_CROPS= 6 # max:9; If your GPU memory is small, it is recommended to set it to 6.
MAX_CONCURRENCY = 100 # If you have limited GPU memory, lower the concurrency count.
NUM_WORKERS = 64 # image pre-process (resize/padding) workers
PRINT_NUM_VIS_TOKENS = False
SKIP_REPEAT = True
MODEL_PATH = 'deepseek-ai/DeepSeek-OCR' # change to your model path
# TODO: change INPUT_PATH
# .pdf: run_dpsk_ocr_pdf.py;
# .jpg, .png, .jpeg: run_dpsk_ocr_image.py;
# Omnidocbench images path: run_dpsk_ocr_eval_batch.py
-INPUT_PATH = ''
-OUTPUT_PATH = ''
+INPUT_PATH = './doc/test.png'
+OUTPUT_PATH = './output'
PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
# PROMPT = '<image>\nFree OCR.'
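The TODO above pairs each `INPUT_PATH` type with a runner script; a minimal sketch of that dispatch as code (hypothetical helper, not part of the repo, assuming `config.py` is importable from the working directory):

```python
import os
import subprocess

from config import INPUT_PATH  # assumption: run from DeepSeek-OCR-vllm/

def pick_runner(path: str) -> str:
    # Mapping described in the TODO comment above.
    ext = os.path.splitext(path)[1].lower()
    if ext == '.pdf':
        return 'run_dpsk_ocr_pdf.py'
    if ext in ('.jpg', '.png', '.jpeg'):
        return 'run_dpsk_ocr_image.py'
    # anything else (e.g. an OmniDocBench images directory) goes to the batch runner
    return 'run_dpsk_ocr_eval_batch.py'

if __name__ == '__main__':
    subprocess.run(['python', pick_runner(INPUT_PATH)], check=True)
```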
......
import asyncio
import re
import os
import argparse
import torch
# On CUDA 11.8, point Triton at the toolkit's own ptxas so JIT-compiled kernels match the toolkit
if torch.version.cuda == '11.8':
    os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
# VLLM_USE_V1='0' keeps vLLM on its legacy v0 engine code path
os.environ['VLLM_USE_V1'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
os.environ["HIP_VISIBLE_DEVICES"] = '0'
from vllm import AsyncLLMEngine, SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
@@ -21,19 +17,17 @@ from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
from process.image_process import DeepseekOCRProcessor
from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, CROP_MODE
ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
def load_image(image_path):
    try:
        image = Image.open(image_path)
        # Apply any EXIF orientation tag so the pixels match the intended rotation
        corrected_image = ImageOps.exif_transpose(image)
        return corrected_image
    except Exception as e:
        print(f"error: {e}")
        return None
try:
@@ -78,18 +72,18 @@ def draw_bounding_boxes(image, refs):
    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw2 = ImageDraw.Draw(overlay)
    # except IOError:
    font = ImageFont.load_default()

    img_idx = 0
    for i, ref in enumerate(refs):
        try:
            result = extract_coordinates_and_label(ref, image_width, image_height)
            if result:
                label_type, points_list = result
                color = (np.random.randint(0, 200), np.random.randint(0, 200), np.random.randint(0, 255))
                color_a = color + (20, )
@@ -110,7 +104,7 @@ def draw_bounding_boxes(image, refs):
                    print(e)
                    pass
                img_idx += 1

                try:
                    if label_type == 'title':
                        draw.rectangle([x1, y1, x2, y2], outline=color, width=4)
@@ -121,13 +115,13 @@ def draw_bounding_boxes(image, refs):
                    text_x = x1
                    text_y = max(0, y1 - 15)

                    text_bbox = draw.textbbox((0, 0), label_type, font=font)
                    text_width = text_bbox[2] - text_bbox[0]
                    text_height = text_bbox[3] - text_bbox[1]

                    draw.rectangle([text_x, text_y, text_x + text_width, text_y + text_height],
                                   fill=(255, 255, 255, 30))
                    draw.text((text_x, text_y), label_type, font=font, fill=color)
                except Exception:
                    pass
@@ -136,30 +130,26 @@ def draw_bounding_boxes(image, refs):
    img_draw.paste(overlay, (0, 0), overlay)
    return img_draw
def process_image_with_refs(image, ref_texts):
    result_image = draw_bounding_boxes(image, ref_texts)
    return result_image
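`draw_bounding_boxes` works in pixel space, while `extract_coordinates_and_label` (used above) parses the grounding refs the model emits. A sketch of the scaling step, assuming boxes arrive on a normalized 0-999 grid (an assumption about the output format, not confirmed by this diff):

```python
def scale_box(norm_box, image_width, image_height, grid=999):
    # Map an (x1, y1, x2, y2) box from the normalized grid to pixel coordinates.
    x1, y1, x2, y2 = norm_box
    return (int(x1 / grid * image_width), int(y1 / grid * image_height),
            int(x2 / grid * image_width), int(y2 / grid * image_height))

# e.g. scale_box((100, 200, 500, 800), 1654, 2339) -> pixel-space box
```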
async def stream_generate(image=None, prompt=''):
    engine_args = AsyncEngineArgs(
        model=MODEL_PATH,
        hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
-        block_size=256,
+        block_size=64,
        max_model_len=8192,
        enforce_eager=False,
        trust_remote_code=True,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
    )
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    logits_processors = [NoRepeatNGramLogitsProcessor(ngram_size=30, window_size=90, whitelist_token_ids={128821, 128822})]  # whitelist: <td>, </td>

    sampling_params = SamplingParams(
        temperature=0.0,
@@ -167,12 +157,12 @@ async def stream_generate(image=None, prompt=''):
        logits_processors=logits_processors,
        skip_special_tokens=False,
        # ignore_eos=False,
    )

    request_id = f"request-{int(time.time())}"
    printed_length = 0

    if image and '<image>' in prompt:
        request = {
@@ -194,7 +184,7 @@ async def stream_generate(image=None, prompt=''):
            print(new_text, end='', flush=True)
            printed_length = len(full_text)

    final_output = full_text
    print('\n')
    return final_output
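`NoRepeatNGramLogitsProcessor` above suppresses OCR degeneration loops: a token is banned when emitting it would repeat a recent 30-gram within the last 90 tokens, except whitelisted table tags. A minimal self-contained sketch of the idea (not the repo's implementation; vLLM's v0 logits processors are callables taking the generated token ids and the next-token logits):

```python
def ban_repeated_ngrams(token_ids, logits, ngram_size=30, window_size=90,
                        whitelist=(128821, 128822)):  # <td>, </td>
    """Ban tokens that would complete an ngram_size-gram already present
    in the last window_size tokens; whitelisted ids stay allowed."""
    window = list(token_ids)[-window_size:]
    if len(window) < ngram_size:
        return logits
    prefix = tuple(window[-(ngram_size - 1):])  # the last n-1 generated tokens
    for i in range(len(window) - ngram_size + 1):
        if tuple(window[i:i + ngram_size - 1]) == prefix:
            candidate = window[i + ngram_size - 1]  # token that completed this n-gram before
            if candidate not in whitelist:
                logits[candidate] = float('-inf')
    return logits
```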
@@ -208,7 +198,7 @@ if __name__ == "__main__":
    image = load_image(INPUT_PATH).convert('RGB')

    if '<image>' in PROMPT:
        image_features = DeepseekOCRProcessor().tokenize_with_images(images=[image], bos=True, eos=True, cropping=CROP_MODE)
@@ -281,9 +271,9 @@ if __name__ == "__main__":
            label = endpoint.split(': ')[0]
            (x, y) = eval(endpoint.split(': ')[1])
            ax.annotate(label, (x, y), xytext=(1, 1), textcoords='offset points',
                        fontsize=5, fontweight='light')

        try:
            if 'Circle' in eval(outputs).keys():
                circle_centers = eval(outputs)['Circle']['circle_center']
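`eval` on model output executes arbitrary expressions; when the output is a plain Python literal, `ast.literal_eval` is a safer drop-in (a suggested alternative, not what the script does):

```python
import ast

parsed = ast.literal_eval(outputs)  # raises ValueError on anything but a literal
if 'Circle' in parsed:
    circle_centers = parsed['Circle']['circle_center']
```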
......
@@ -3,17 +3,12 @@ import fitz
import img2pdf
import io
import re
from tqdm import tqdm
import torch
from concurrent.futures import ThreadPoolExecutor
if torch.version.cuda == '11.8':
    os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda-11.8/bin/ptxas"
os.environ['VLLM_USE_V1'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
from config import MODEL_PATH, INPUT_PATH, OUTPUT_PATH, PROMPT, SKIP_REPEAT, MAX_CONCURRENCY, NUM_WORKERS, CROP_MODE
from PIL import Image, ImageDraw, ImageFont
@@ -32,9 +27,9 @@ ModelRegistry.register_model("DeepseekOCRForCausalLM", DeepseekOCRForCausalLM)
llm = LLM(
    model=MODEL_PATH,
    hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
-    block_size=256,
+    block_size=64,
    enforce_eager=False,
    trust_remote_code=True,
    max_model_len=8192,
    swap_space=0,
    max_num_seqs=MAX_CONCURRENCY,
@@ -59,19 +54,19 @@ class Colors:
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    RESET = '\033[0m'
def pdf_to_images_high_quality(pdf_path, dpi=144, image_format="PNG"):
"""
pdf2images
"""
images = []
pdf_document = fitz.open(pdf_path)
zoom = dpi / 72.0
matrix = fitz.Matrix(zoom, zoom)
for page_num in range(pdf_document.page_count):
page = pdf_document[page_num]
@@ -88,9 +83,9 @@ def pdf_to_images_high_quality(pdf_path, dpi=144, image_format="PNG"):
            background = Image.new('RGB', img.size, (255, 255, 255))
            background.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None)
            img = background

        images.append(img)

    pdf_document.close()
    return images
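Since PDF geometry is defined in points at 72 per inch, `zoom = dpi / 72.0`, so the default `dpi=144` renders at exactly 2x. A usage sketch (paths hypothetical):

```python
pages = pdf_to_images_high_quality('./doc/test.pdf', dpi=144)
for idx, page in enumerate(pages):
    page.save(f'./output/page_{idx:03d}.png')
```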
@@ -98,18 +93,18 @@ def pil_to_pdf_img2pdf(pil_images, output_path):
    if not pil_images:
        return

    image_bytes_list = []

    for img in pil_images:
        if img.mode != 'RGB':
            img = img.convert('RGB')

        img_buffer = io.BytesIO()
        img.save(img_buffer, format='JPEG', quality=95)
        img_bytes = img_buffer.getvalue()
        image_bytes_list.append(img_bytes)

    try:
        pdf_bytes = img2pdf.convert(image_bytes_list)
        with open(output_path, "wb") as f:
@@ -156,18 +151,18 @@ def draw_bounding_boxes(image, refs, jdx):
    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw2 = ImageDraw.Draw(overlay)
    # except IOError:
    font = ImageFont.load_default()

    img_idx = 0
    for i, ref in enumerate(refs):
        try:
            result = extract_coordinates_and_label(ref, image_width, image_height)
            if result:
                label_type, points_list = result
                color = (np.random.randint(0, 200), np.random.randint(0, 200), np.random.randint(0, 255))
                color_a = color + (20, )
@@ -188,7 +183,7 @@ def draw_bounding_boxes(image, refs, jdx):
                    print(e)
                    pass
                img_idx += 1

                try:
                    if label_type == 'title':
                        draw.rectangle([x1, y1, x2, y2], outline=color, width=4)
@@ -199,13 +194,13 @@ def draw_bounding_boxes(image, refs, jdx):
                    text_x = x1
                    text_y = max(0, y1 - 15)

                    text_bbox = draw.textbbox((0, 0), label_type, font=font)
                    text_width = text_bbox[2] - text_bbox[0]
                    text_height = text_bbox[3] - text_bbox[1]

                    draw.rectangle([text_x, text_y, text_x + text_width, text_y + text_height],
                                   fill=(255, 255, 255, 30))
                    draw.text((text_x, text_y), label_type, font=font, fill=color)
                except Exception:
                    pass
@@ -234,7 +229,7 @@ if __name__ == "__main__":
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    os.makedirs(f'{OUTPUT_PATH}/images', exist_ok=True)

    print(f'{Colors.RED}PDF loading .....{Colors.RESET}')
@@ -245,7 +240,7 @@ if __name__ == "__main__":
    # batch_inputs = []
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        batch_inputs = list(tqdm(
            executor.map(process_single_image, images),
            total=len(images),
@@ -292,7 +287,7 @@ if __name__ == "__main__":
            if SKIP_REPEAT:
                continue

        page_num = '\n<--- Page Split --->'
        contents_det += content + f'\n{page_num}\n'
......
@@ -16,10 +16,6 @@ DeepSeek-OCR's capabilities include:
- Multilingual processing (mixed Chinese and English recognition)
- Object localization (grounding support)
-<div align=center>
-<img src="./doc/xxx.png"/>
-</div>
## Environment Setup

### Hardware Requirements

DCU model: K100AI; nodes: 1; cards per node: 1.
@@ -28,7 +24,7 @@ DCU model: K100AI; nodes: 1; cards per node: 1.
### Docker (Option 1)
```bash
-docker pull image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2-ds3.2
+docker pull image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash
@@ -52,7 +48,7 @@ DTK: 25.04.1
python: 3.10.12
torch: 2.5.1+das.opt1.dtk25041
transformers: 4.46.3
-vllm: 0.9.2
+vllm: 0.8.5
```
`Tip: the DTK driver, PyTorch, and the other DCU-related tool versions above must correspond exactly, one to one.` Install the remaining non-deep-learning libraries from requirements.txt:
```bash
@@ -67,25 +63,25 @@ pip install -r requirements.txt
## Inference
### transformers
-Adjust the model path, test image path, and output path to your setup.
+> Adjust the model path, test image path, and output path to your setup.
```bash
cd DeepSeek-OCR-master
-python run_dpsk_ocr.py --model_name_or_path=deepseek-ai/DeepSeek-OCR --image_path=./doc/test.jpg --output_path=./output
+python DeepSeek-OCR-hf/run_dpsk_ocr.py --model_name_or_path=deepseek-ai/DeepSeek-OCR --image_file=./doc/test.png --output_path=./output
```
### vllm
> Adjust the model path, test image path, and output path in `DeepSeek-OCR-vllm/config.py` to your setup.
```bash
cd DeepSeek-OCR-master
-# image: streaming output
-python run_dpsk_ocr_image.py
+export VLLM_USE_V1=0
+# image: streaming output
+python DeepSeek-OCR-vllm/run_dpsk_ocr_image.py
# pdf
-python run_dpsk_ocr_pdf.py
+python DeepSeek-OCR-vllm/run_dpsk_ocr_pdf.py
```
## Result
<div align=center>
<img src="./doc/xxx.png"/>
<img src="./doc/result_with_boxes_vllm.jpg"/>
</div>
### Accuracy
......
-FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2-ds3.2
\ No newline at end of file
+FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724
\ No newline at end of file
# Unique model identifier
-modelCode=205
+modelCode=1781
# Model name
-modelName=PaddleOCR_paddle_onnxruntime
+modelName=deepseek-ocr_pytorch
# Model description
-modelDescription=paddleocr_paddle_onnxruntime is a model implementing character detection and recognition
+modelDescription=DeepSeek-OCR, DeepSeek's brand-new vision-text compression model
# Application scenarios
-appScenario=Inference,Training,OCR,Manufacturing,Finance,Transportation,Education,Healthcare
+appScenario=Inference,OCR,Manufacturing,Finance,Transportation,Education,Healthcare
# Framework type
-frameType=paddle,onnxruntime
+frameType=pytorch,vllm
# Accelerator type
accelerateType=K100AI
\ No newline at end of file