Commit 6ad287f7 authored by liuxu3's avatar liuxu3
Browse files

Added DeepSeek OCR API (contributed by liushengtong).

parent 80c11a03
import torch.nn as nn
import torch
import torch.nn.functional as F
import copy
class MlpProjector(nn.Module):
    """Projects vision-encoder features into the language-model embedding space.

    The concrete architecture is selected by ``cfg.projector_type``.  ``cfg``
    must be dict-like with attribute access plus ``.get`` with defaults
    (e.g. an ``addict.Dict``).
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        if cfg.projector_type == "identity":
            # Pass-through: features already match the LM embedding width.
            modules = nn.Identity()
        elif cfg.projector_type == "linear":
            modules = nn.Linear(cfg.input_dim, cfg.n_embed)
        elif cfg.projector_type == "mlp_gelu":
            # depth-layer MLP: Linear -> (GELU -> Linear) * (depth - 1)
            mlp_depth = cfg.get("depth", 1)
            modules = [nn.Linear(cfg.input_dim, cfg.n_embed)]
            for _ in range(1, mlp_depth):
                modules.append(nn.GELU())
                modules.append(nn.Linear(cfg.n_embed, cfg.n_embed))
            modules = nn.Sequential(*modules)
        elif cfg.projector_type == "normlayer_downsample_mlp_gelu":
            # Same as "downsample_mlp_gelu" below, but with a LayerNorm applied
            # to the spatially-concatenated (downsample_ratio^2) features first.
            mlp_depth = cfg.get("depth", 1)
            mlp_ratio = cfg.get("mlp_ratio", 1)
            modules = [
                nn.LayerNorm(cfg.input_dim * cfg.downsample_ratio * cfg.downsample_ratio),
                nn.Linear(cfg.input_dim * cfg.downsample_ratio * cfg.downsample_ratio, cfg.n_embed * mlp_ratio)
            ]
            for _ in range(1, mlp_depth - 1):
                modules.append(nn.GELU())
                modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed * mlp_ratio))
            modules.append(nn.GELU())
            modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed))
            modules = nn.Sequential(*modules)
        elif cfg.projector_type == "downsample_mlp_gelu":
            # MLP applied after 2D space-to-depth downsampling (see forward()).
            mlp_depth = cfg.get("depth", 1)
            mlp_ratio = cfg.get("mlp_ratio", 1)
            modules = [nn.Linear(cfg.input_dim * cfg.downsample_ratio * cfg.downsample_ratio, cfg.n_embed * mlp_ratio)]
            for _ in range(1, mlp_depth - 1):
                modules.append(nn.GELU())
                modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed * mlp_ratio))
            modules.append(nn.GELU())
            modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed))
            modules = nn.Sequential(*modules)
        elif cfg.projector_type == "low_high_hybrid_split_mlp_gelu":
            # High/low streams are each projected to n_embed // 2, concatenated
            # in forward(), then passed through a shared MLP tail.
            mlp_depth = cfg.get("depth", 1)
            self.high_up_proj = nn.Linear(cfg.input_dim, cfg.n_embed // 2)
            self.low_up_proj = nn.Linear(cfg.input_dim, cfg.n_embed // 2)
            modules = []
            for _ in range(1, mlp_depth):
                modules.append(nn.GELU())
                modules.append(nn.Linear(cfg.n_embed, cfg.n_embed))
            modules = nn.Sequential(*modules)
        elif cfg.projector_type == "hybrid_split_feature_mlp_gelu":
            # Like the branch above, but the two streams arrive concatenated on
            # the channel axis (widths cfg.input_dim[0] / cfg.input_dim[1]) and
            # the n_embed split point is configurable via channel_div.
            mlp_depth = cfg.get("depth", 1)
            channel_div = cfg.get("channel_div", 0.5)
            self.high_up_proj = nn.Linear(cfg.input_dim[0], int(cfg.n_embed * channel_div))
            self.low_up_proj = nn.Linear(cfg.input_dim[1], cfg.n_embed - int(cfg.n_embed * channel_div))
            modules = []
            for _ in range(1, mlp_depth):
                modules.append(nn.GELU())
                modules.append(nn.Linear(cfg.n_embed, cfg.n_embed))
            modules = nn.Sequential(*modules)
        elif cfg.projector_type == "low_high_split_mlp_gelu":
            # Separate MLP towers for the high/low streams, each n_embed // 2 wide.
            mlp_depth = cfg.get("depth", 1)
            modules = []
            for _ in range(1, mlp_depth):
                modules.append(nn.GELU())
                modules.append(nn.Linear(cfg.n_embed // 2, cfg.n_embed // 2))
            modules = nn.Sequential(*modules)
            # NOTE(review): high_layers re-wraps the SAME module objects as
            # `modules` (and therefore shares parameters with self.layers set
            # below), while low_layers is an independent deep copy — confirm
            # this parameter sharing is intentional.
            self.high_layers = nn.Sequential(*modules)
            self.low_layers = copy.deepcopy(modules)
        else:
            raise ValueError(f"Unknown projector type: {cfg.projector_type}")
        if cfg.get("token_pooling", False):
            # 2x2 token pooling: 4 neighbouring tokens are concatenated and
            # projected back down to input_dim (see forward()).
            self.token_pooling_layer = nn.Linear(cfg.input_dim * 4, cfg.input_dim)
        if cfg.get("conv_fusion_high_low_features", False):
            self.fusion_layer = nn.Linear(cfg.input_dim, cfg.input_dim)
        self.layers = modules

    def forward(self, x):
        if self.cfg.get("token_pooling", False):
            # Merge each 2x2 neighbourhood of tokens into one token.
            batch_size, wxh, channels = x.shape
            w = h = int(wxh**0.5)  # assumes a square token grid — TODO confirm
            x = x.view(batch_size, w, h, channels)
            x = x.permute(0, 3, 1, 2)
            # import ipdb; ipdb.set_trace()
            patches = x.unfold(2, 2, 2).unfold(3, 2, 2)
            batch_size, channels, h_patches, w_patches, _, _ = patches.size()
            # concatenate along the channel dimension
            patches = patches.contiguous().view(batch_size, channels, h_patches * w_patches, -1)
            # project through the pooling linear layer
            patches = patches.permute(0, 2, 1, 3).contiguous()
            patches = patches.view(batch_size, h_patches * w_patches, channels * 4)
            x = self.token_pooling_layer(patches)
        if self.cfg.get("conv_fusion_high_low_features", False):
            # x[:, 0] / x[:, 1] are presumably the high/low feature maps to
            # fuse — verify against the caller.
            x = self.fusion_layer(x[:, 0]) + x[:, 1]
        if self.cfg.projector_type == 'low_high_hybrid_split_mlp_gelu':
            # x is expected to be a (high, low) pair here — TODO confirm.
            high_x, low_x = x[0], x[1]
            high_x = self.high_up_proj(high_x)
            low_x = self.low_up_proj(low_x)
            x = torch.concat([high_x, low_x], dim=-1)
        if self.cfg.projector_type == 'hybrid_split_feature_mlp_gelu':
            # Split the channel-concatenated input back into its two streams.
            high_x = x[...,:self.cfg.input_dim[0]]
            low_x = x[...,self.cfg.input_dim[0]:]
            high_x = self.high_up_proj(high_x)
            low_x = self.low_up_proj(low_x)
            x = torch.concat([high_x, low_x], dim=-1)
        if self.cfg.projector_type == 'low_high_split_mlp_gelu':
            # Two independent towers; result is returned directly and
            # self.layers is NOT applied on this path.
            high_x, low_x = x[0], x[1]
            high_x = self.high_layers(high_x)
            low_x = self.low_layers(low_x)
            x = torch.concat([high_x, low_x], dim=-1)
            return x
        if self.cfg.projector_type == 'downsample_mlp_gelu' or self.cfg.projector_type == 'normlayer_downsample_mlp_gelu':
            # Space-to-depth: group downsample_ratio^2 neighbouring tokens into
            # one token with a correspondingly larger channel dimension.
            bs, hw, input_dim = x.shape
            h = w = int((hw) ** 0.5)  # assumes a square token grid — TODO confirm
            """compute padding"""
            if h % self.cfg.downsample_ratio:
                pad = self.cfg.downsample_ratio - h % self.cfg.downsample_ratio
            else:
                pad = 0
            x = x.reshape(bs, h, w, input_dim)
            if pad > 0:
                x = F.pad(x, (0, 0, 0, pad, 0, pad), "constant", 0)
            """4 to 1 concat"""
            x = x.permute(0, 3, 1, 2)  # B, C, H, W
            x = F.unfold(x, kernel_size=self.cfg.downsample_ratio, stride=self.cfg.downsample_ratio, padding=0)  # B, C*4, HW // 4
            x = x.permute(0, 2, 1)
        return self.layers(x)

    @staticmethod
    def get_flops_per_sample(cfg):
        """Rough per-sample FLOPs estimate for the projector.

        Returns 3x the forward multiply-add count — presumably to account for
        the backward pass (~2x forward); confirm against the budgeting code.
        """
        if cfg.projector_type == "linear":
            fwd = 2 * cfg.input_dim * cfg.n_embed
        elif "mlp_gelu" in cfg.projector_type :
            mlp_depth = cfg.get("depth", 1)
            downsample_ratio = cfg.get("downsample_ratio", 1)
            input_dim = sum(cfg.input_dim) if isinstance(cfg.input_dim, list) else cfg.input_dim
            input_dim = input_dim * downsample_ratio * downsample_ratio
            fwd = 2 * input_dim * cfg.n_embed + (mlp_depth - 1) * 2 * cfg.n_embed * cfg.n_embed
        else:
            fwd = 0
        return fwd * 3
import torch
import torch.nn as nn
import transformers
class CustomQwen2Decoder(nn.Module):
    """
    Qwen2 decoder stack repurposed as a visual encoder.

    Attention is controlled per token by ``token_type_ids``:
      * 0 — "image" tokens: full bidirectional (non-causal) attention among
        all type-0 tokens;
      * 1 — "text"/query tokens: attend to every type-0 token plus causally
        to type-1 tokens up to and including themselves.
    See ``_create_custom_4d_mask`` for the exact mask construction.
    """

    def __init__(
        self,
        decoder_layer: int = 24,
        max_position_embeddings: int = 131072,
        hidden_dimension: int = 896,
        num_attention_heads: int = 14,
        num_key_value_heads: int = 2,
        intermediate_size: int = 4864,
        vocab_size: int = 151936,
        attn_implementation: str = "sdpa",  # ⭐
        rms_norm_eps: float = 1e-06,
        rope_theta: float = 1000000.0,
        attention_dropout: float = 0.0,
        hidden_act: str = "silu",
        initializer_range: float = 0.02,
    ):
        super().__init__()
        # The dense 4D mask built below cannot be consumed by
        # flash_attention_2, so only 'sdpa' / 'eager' are accepted.
        if attn_implementation == "flash_attention_2":
            raise ValueError(
                "CustomQwen2Decoder do not support flash_attention_2,"
                "new attention mask needs 'sdpa' or 'eager'"
            )
        # Resolve the transformers classes dynamically.
        Qwen2Model = getattr(transformers.models.qwen2.modeling_qwen2, 'Qwen2Model')
        Qwen2Config = getattr(transformers, 'Qwen2Config')
        # Build the HF config from the constructor arguments.
        config = Qwen2Config(
            hidden_size=hidden_dimension,
            num_hidden_layers=decoder_layer,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            intermediate_size=intermediate_size,
            max_position_embeddings=max_position_embeddings,
            vocab_size=vocab_size,
            rms_norm_eps=rms_norm_eps,
            rope_theta=rope_theta,
            attention_dropout=attention_dropout,
            hidden_act=hidden_act,
            initializer_range=initializer_range,
            _attn_implementation=attn_implementation,  # ⭐
        )
        self.model = self._create_custom_model(Qwen2Model, config)
        # Inputs are always pre-embedded features (inputs_embeds), so the
        # token embedding table is dropped to save memory.
        del self.model.embed_tokens

    def _create_custom_model(self, Qwen2Model, config):
        """Build a Qwen2Model subclass whose attention mask honours token_type_ids."""

        class CustomQwen2ModelInner(Qwen2Model):

            def forward(
                self,
                input_ids=None,
                attention_mask=None,
                position_ids=None,
                past_key_values=None,
                inputs_embeds=None,
                token_type_ids=None,  # ⭐
                use_cache=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None,
                cache_position=None,
            ):
                # Stash token_type_ids so that _update_causal_mask (invoked
                # inside super().forward) can see them.
                self._current_token_type_ids = token_type_ids
                outputs = super().forward(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_values=past_key_values,
                    inputs_embeds=inputs_embeds,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    output_hidden_states=output_hidden_states,
                    return_dict=return_dict,
                    cache_position=cache_position,
                )
                return outputs

            def _update_causal_mask(
                self,
                attention_mask,
                input_tensor,
                cache_position,
                past_key_values,
                output_attentions,
            ):
                # Overrides Qwen2Model's mask hook.  NOTE(review): the
                # signature must track the installed transformers version —
                # verify on library upgrades.
                dtype, device = input_tensor.dtype, input_tensor.device
                min_dtype = torch.finfo(dtype).min
                batch_size, sequence_length = input_tensor.shape[0], input_tensor.shape[1]
                token_type_ids = self._current_token_type_ids
                # Dense additive attention mask (0 = attend, min_dtype = blocked).
                causal_mask = self._create_custom_4d_mask(
                    sequence_length=sequence_length,
                    dtype=dtype,
                    device=device,
                    batch_size=batch_size,
                    token_type_ids=token_type_ids,
                )
                # Fold in a 2D padding mask, if the caller provided one.
                if attention_mask is not None and attention_mask.dim() == 2:
                    padding_mask = attention_mask[:, None, None, :].to(dtype=dtype)
                    padding_mask = (1.0 - padding_mask) * min_dtype
                    causal_mask = causal_mask + padding_mask
                return causal_mask

            def _create_custom_4d_mask(
                self,
                sequence_length,
                dtype,
                device,
                batch_size,
                token_type_ids,
            ):
                """Build a (batch, 1, seq, seq) additive mask from token_type_ids."""
                min_dtype = torch.finfo(dtype).min
                masks = []
                for b in range(batch_size):
                    # Start fully blocked, then open the allowed pairs.
                    mask = torch.full(
                        (sequence_length, sequence_length),
                        fill_value=min_dtype,
                        dtype=dtype,
                        device=device
                    )
                    type_ids = token_type_ids[b]
                    image_positions = (type_ids == 0).nonzero(as_tuple=True)[0]
                    text_positions = (type_ids == 1).nonzero(as_tuple=True)[0]
                    # Non-causal: image tokens attend to all image tokens.
                    if len(image_positions) > 0:
                        mask[image_positions[:, None], image_positions] = 0.0
                    # Causal: each text token sees all image tokens plus the
                    # text tokens up to and including itself.
                    for i, text_pos in enumerate(text_positions):
                        if len(image_positions) > 0:
                            mask[text_pos, image_positions] = 0.0
                        mask[text_pos, text_positions[:i+1]] = 0.0
                    masks.append(mask)
                mask = torch.stack(masks, dim=0).unsqueeze(1)
                return mask

        return CustomQwen2ModelInner(config)

    def forward(
        self,
        inputs_embeds,
        token_type_ids,
        attention_mask=None,
        **kwargs
    ):
        """
        Args:
            inputs_embeds: [batch_size, seq_len, hidden_dim]
            token_type_ids: [batch_size, seq_len], 0=non-causal, 1=causal
            attention_mask: [batch_size, seq_len], optional
        """
        return self.model(
            inputs_embeds=inputs_embeds,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            **kwargs
        )
# batch_size = 2
# inputs_embeds = torch.randn(batch_size, 512, 896).cuda()
# inputs_embeds = torch.randn(batch_size, 512, 896).cuda()
# token_type_ids = torch.cat([
# torch.zeros(batch_size, 256, dtype=torch.long),
# torch.ones(batch_size, 256, dtype=torch.long),
# ], dim=1).cuda()
# # start = time.time()
# with torch.no_grad():
# outputs_sdpa = decoder_sdpa(inputs_embeds, token_type_ids)
# print(outputs_sdpa[0].shape)
# print(f"SDPA time: {time.time() - start:.4f}s")
class Qwen2Decoder2Encoder(nn.Module):
    """
    Wraps CustomQwen2Decoder so a Qwen2 decoder stack can be used as a
    feature encoder.

    A 2D feature map is flattened into a token sequence, a learned query bank
    (sized to match the number of input tokens) is appended, and the decoder
    runs with bidirectional attention over the image tokens and causal
    attention over the queries.  Only the query outputs are returned.
    """

    def __init__(
        self,
        decoder_layer: int,
        hidden_dimension: int,
        num_attention_heads: int,
        num_key_value_heads: int,
        intermediate_size: int,
        max_query: int,
    ):
        super().__init__()
        # NOTE: max_query is currently unused (kept for interface
        # compatibility); query banks are fixed at 144 / 256 tokens below.
        self.model = CustomQwen2Decoder(
            decoder_layer=decoder_layer,
            hidden_dimension=hidden_dimension,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            intermediate_size=intermediate_size,
            attn_implementation="sdpa",
        )
        # Learned query embeddings for the two supported token counts:
        # 144 (12x12 grid, "768" input) and 256 (16x16 grid, "1024" input).
        self.query_768 = nn.Embedding(144, hidden_dimension)
        self.query_1024 = nn.Embedding(256, hidden_dimension)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: feature map of shape (batch, channels, H, W) with H*W in {144, 256}.

        Returns:
            Tensor of shape (batch, n_query, hidden_dimension): decoder outputs
            at the query positions.

        Raises:
            ValueError: if H*W is neither 144 nor 256.
        """
        x = x.flatten(2).transpose(1, 2)  # (B, C, H, W) -> (B, H*W, C)
        bs, n_query, _ = x.shape
        if n_query == 144:
            param_img = self.query_768.weight
        elif n_query == 256:
            param_img = self.query_1024.weight
        else:
            # Previously this fell through with `param_img` unbound and raised
            # an opaque NameError; fail with an explicit message instead.
            raise ValueError(
                f"Unsupported number of input tokens: {n_query} (expected 144 or 256)"
            )
        batch_query_imgs = param_img.unsqueeze(0).expand(
            bs, -1, -1
        )  # (batch_size, num_queries, hidden_size)
        x_combined = torch.cat([x, batch_query_imgs], dim=1)
        # 0 = image tokens (bidirectional), 1 = query tokens (causal).
        # Built on the input's device so CUDA inputs work without manual moves.
        token_type_ids = torch.cat([
            torch.zeros(bs, n_query, dtype=torch.long, device=x.device),
            torch.ones(bs, n_query, dtype=torch.long, device=x.device),
        ], dim=1)
        y = self.model(x_combined, token_type_ids)[0]
        y = y[:, n_query:, :]  # keep only the causal-flow query outputs
        return y
def build_qwen2_decoder_as_encoder(
    decoder_layer=24,
    hidden_dimension=896,
    num_attention_heads=14,
    num_key_value_heads=2,
    intermediate_size=4864,
    max_query=400,
    checkpoint=None,
):
    """Factory for Qwen2Decoder2Encoder, optionally loading checkpoint weights.

    Args:
        checkpoint: path to a state_dict saved with torch.save; when None the
            model keeps its random initialization.

    Returns:
        A Qwen2Decoder2Encoder instance on CPU; the caller moves it to the
        target device.
    """
    decoder_as_encoder = Qwen2Decoder2Encoder(
        decoder_layer=decoder_layer,
        hidden_dimension=hidden_dimension,
        num_attention_heads=num_attention_heads,
        num_key_value_heads=num_key_value_heads,
        intermediate_size=intermediate_size,
        max_query=max_query,
    )
    if checkpoint is not None:
        # map_location="cpu" lets CUDA-saved checkpoints load on CPU-only
        # hosts (torch.load would otherwise try to restore CUDA tensors).
        state_dict = torch.load(checkpoint, map_location="cpu")
        decoder_as_encoder.load_state_dict(state_dict, strict=True)
        print(checkpoint)
    return decoder_as_encoder
if __name__ == '__main__':
    # Smoke test: a random 16x16 feature map with 896 channels (256 tokens).
    # checkpoint must be None here — the previous value '' is not None, so it
    # was passed to torch.load('') and crashed before the forward pass.
    x = torch.zeros(2, 896, 16, 16).cuda()
    net = build_qwen2_decoder_as_encoder(checkpoint=None).cuda()
    y = net(x)
    print('-------shape---------')
    print(y.shape)
    print('-------------------')
\ No newline at end of file
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, Type
from functools import partial
from flash_attn import flash_attn_qkvpacked_func
# from .common import LayerNorm2d, MLPBlock
# from mmgpt.model.vision_encoder.flash_4 import _attention_rel_h_rel_w
def get_abs_pos(abs_pos, tgt_size):
    """Resize absolute position embeddings to a new square grid size.

    Args:
        abs_pos: position embeddings of shape (1, src, src, C).
        tgt_size: target side length of the token grid.

    Returns:
        Embeddings of shape (1, tgt_size, tgt_size, C); the input tensor is
        returned unchanged when the sizes already match.
    """
    if abs_pos.size(1) == tgt_size:
        return abs_pos
    orig_dtype = abs_pos.dtype
    # Interpolate in float32 for numerical stability, channels-first layout.
    grid = abs_pos.permute(0, 3, 1, 2).to(torch.float32)
    grid = F.interpolate(
        grid,
        size=(tgt_size, tgt_size),
        mode='bicubic',
        antialias=True,
        align_corners=False,
    ).to(orig_dtype)
    return grid.permute(0, 2, 3, 1)
class MLPBlock(nn.Module):
    """Two-layer feed-forward block: Linear -> activation -> Linear."""

    def __init__(
        self,
        embedding_dim: int,
        mlp_dim: int,
        act: Type[nn.Module] = nn.GELU,
    ) -> None:
        super().__init__()
        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
        self.act = act()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.act(self.lin1(x))
        return self.lin2(hidden)
# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
class LayerNorm2d(nn.Module):
    """LayerNorm over the channel dimension of NCHW tensors.

    Normalizes each spatial position across channels, then applies a learned
    per-channel affine transform.
    """

    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        mean = x.mean(dim=1, keepdim=True)
        var = (x - mean).pow(2).mean(dim=1, keepdim=True)
        normed = (x - mean) / torch.sqrt(var + self.eps)
        # Broadcast the (C,) affine parameters over H and W.
        return self.weight[:, None, None] * normed + self.bias[:, None, None]
# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
class ImageEncoderViT(nn.Module):
    def __init__(
        self,
        img_size: int = 1024,
        patch_size: int = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        depth: int = 12,
        num_heads: int = 12,
        mlp_ratio: float = 4.0,
        out_chans: int = 256,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_abs_pos: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        global_attn_indexes: Tuple[int, ...] = (),
    ) -> None:
        """
        Args:
            img_size (int): Input image size.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            out_chans (int): Channel width produced by the neck.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            global_attn_indexes (list): Indexes for blocks using global attention.
        """
        super().__init__()
        self.img_size = img_size
        self.patch_embed = PatchEmbed(
            kernel_size=(patch_size, patch_size),
            stride=(patch_size, patch_size),
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
        self.pos_embed: Optional[nn.Parameter] = None
        if use_abs_pos:
            # Initialize absolute positional embedding with pretrain image size.
            self.pos_embed = nn.Parameter(
                torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim)
            )
        self.blocks = nn.ModuleList()
        for i in range(depth):
            # Blocks listed in global_attn_indexes use global attention
            # (window_size 0); all others use windowed attention.
            block = Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                norm_layer=norm_layer,
                act_layer=act_layer,
                use_rel_pos=use_rel_pos,
                rel_pos_zero_init=rel_pos_zero_init,
                window_size=window_size if i not in global_attn_indexes else 0,
                input_size=(img_size // patch_size, img_size // patch_size),
            )
            self.blocks.append(block)
        # SAM-style neck: 1x1 projection to out_chans, then a 3x3 conv, each
        # followed by channel-wise LayerNorm.
        self.neck = nn.Sequential(
            nn.Conv2d(
                embed_dim,
                out_chans,
                kernel_size=1,
                bias=False,
            ),
            LayerNorm2d(out_chans),
            nn.Conv2d(
                out_chans,
                out_chans,
                kernel_size=3,
                padding=1,
                bias=False,
            ),
            LayerNorm2d(out_chans),
        )
        # Extra stride-2 convolutions (not in the original SAM encoder):
        # 256 -> 512 -> 896 channels, each halving the spatial resolution.
        self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
        self.net_3 = nn.Conv2d(512, 896, kernel_size=3, stride=2, padding=1, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # (B, 3, H, W) -> (B, H/patch, W/patch, embed_dim)
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            # x = x + self.pos_embed
            # Resize the pretrained position grid if the token grid differs.
            x = x + get_abs_pos(self.pos_embed, x.size(1))
        for blk in self.blocks:
            x = blk(x)
        # Channels-last -> channels-first for the convolutional neck.
        neck_output = self.neck(x.permute(0, 3, 1, 2))
        conv2_output = self.net_2(neck_output)
        # print(f"conv2_output shape: {conv2_output.shape}")
        conv3_output = self.net_3(conv2_output)
        return conv3_output
class Block(nn.Module):
    """Transformer blocks with support of window attention and residual propagation blocks"""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks. If it equals 0, then
                use global attention.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        """
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            use_rel_pos=use_rel_pos,
            rel_pos_zero_init=rel_pos_zero_init,
            # With windowed attention, rel-pos tables are sized to the window.
            input_size=input_size if window_size == 0 else (window_size, window_size),
        )
        self.norm2 = norm_layer(dim)
        self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)
        self.window_size = window_size

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Pre-norm residual attention, optionally restricted to local windows.
        shortcut = x
        x = self.norm1(x)
        # Window partition
        if self.window_size > 0:
            H, W = x.shape[1], x.shape[2]
            x, pad_hw = window_partition(x, self.window_size)
        x = self.attn(x)
        # Reverse window partition
        if self.window_size > 0:
            x = window_unpartition(x, self.window_size, pad_hw, (H, W))
        x = shortcut + x
        # Residual MLP sub-block.
        x = x + self.mlp(self.norm2(x))
        return x
class Attention(nn.Module):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
                NOTE(review): this flag is accepted but never read — the tables
                below are always zero-initialized; confirm whether it is needed.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        """
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE: self.scale is unused here; SDPA applies its own default scaling.
        self.scale = head_dim**-0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)
        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            assert (
                input_size is not None
            ), "Input size must be provided if using relative positional encoding."
            # initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, H, W, _ = x.shape
        # qkv with shape (3, B, nHead, H * W, C)
        qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # q, k, v with shape (B * nHead, H * W, C)
        q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
        rel_h, rel_w = None, None
        if self.use_rel_pos:
            # Decomposed relative position bias terms (returned as a pair,
            # combined into an additive attention bias below).
            rel_h, rel_w = add_decomposed_rel_pos(q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
        # Reshape back to (B, nHead, H*W, head_dim) for SDPA.
        q = q.view(B, self.num_heads, H * W, -1)
        k = k.view(B, self.num_heads, H * W, -1)
        v = v.view(B, self.num_heads, H * W, -1)
        if self.use_rel_pos:
            rel_h = rel_h.view(B, self.num_heads, rel_h.size(1), rel_h.size(2), rel_h.size(3))
            rel_w = rel_w.view(B, self.num_heads, rel_w.size(1), rel_w.size(2), rel_w.size(3))
            # Broadcast-sum the (…, k_h, 1) and (…, 1, k_w) terms into a full
            # (…, q_hw, k_hw) additive bias.
            attn_bias = (rel_h + rel_w).view(B, self.num_heads, rel_h.size(2), rel_h.size(3) * rel_w.size(4))
            x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
            # x = _attention_rel_h_rel_w(q, k, v, rel_h, rel_w)
        else:
            x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
            # qkv = torch.stack([q, k, v], dim=1).transpose(1, 3).reshape(B, H * W, 3, self.num_heads, -1)
            # x = flash_attn_qkvpacked_func(qkv, dropout_p=0.0, causal=False).transpose(1, 2)
        # (B, nHead, H*W, hd) -> (B, H, W, dim)
        x = x.view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
        x = self.proj(x)
        return x
def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
    """
    Split a (B, H, W, C) tensor into non-overlapping square windows, zero-padding
    the bottom/right edges when H or W is not a multiple of window_size.

    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition.
    """
    batch, height, width, channels = x.shape
    pad_bottom = (window_size - height % window_size) % window_size
    pad_right = (window_size - width % window_size) % window_size
    if pad_bottom or pad_right:
        x = F.pad(x, (0, 0, 0, pad_right, 0, pad_bottom))
    padded_h = height + pad_bottom
    padded_w = width + pad_right
    # Reshape into a grid of windows, then flatten the grid dims into batch.
    grid = x.view(
        batch,
        padded_h // window_size,
        window_size,
        padded_w // window_size,
        window_size,
        channels,
    )
    windows = (
        grid.permute(0, 1, 3, 2, 4, 5)
        .contiguous()
        .view(-1, window_size, window_size, channels)
    )
    return windows, (padded_h, padded_w)
def window_unpartition(
    windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]
) -> torch.Tensor:
    """
    Inverse of window_partition: reassemble windows into a (B, H, W, C) map and
    strip any padding that was added before partitioning.

    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    padded_h, padded_w = pad_hw
    height, width = hw
    windows_per_image = padded_h * padded_w // window_size // window_size
    batch = windows.shape[0] // windows_per_image
    grid = windows.view(
        batch, padded_h // window_size, padded_w // window_size, window_size, window_size, -1
    )
    x = grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch, padded_h, padded_w, -1)
    if padded_h > height or padded_w > width:
        # Drop the zero padding introduced by window_partition.
        x = x[:, :height, :width, :].contiguous()
    return x
def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
    """
    Select (and, if needed, resize) relative positional embeddings for every
    query/key offset.

    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Embeddings indexed by relative position, shape (q_size, k_size, C).
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    if rel_pos.shape[0] == max_rel_dist:
        resized = rel_pos
    else:
        # Linearly interpolate the embedding table to the required length,
        # doing the arithmetic in float32 and casting back afterwards.
        orig_dtype = rel_pos.dtype
        table = rel_pos.to(torch.float32).reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1)
        table = F.interpolate(
            table,
            size=max_rel_dist,
            mode="linear",
        ).to(orig_dtype)
        resized = table.reshape(-1, max_rel_dist).permute(1, 0)
    # Scale the coordinates with the shorter length when q and k sizes differ.
    q_coords = torch.arange(q_size, device=rel_pos.device)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size, device=rel_pos.device)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
    return resized[relative_coords.long()]
def add_decomposed_rel_pos(
    q: torch.Tensor,
    rel_pos_h: torch.Tensor,
    rel_pos_w: torch.Tensor,
    q_size: Tuple[int, int],
    k_size: Tuple[int, int],
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Compute decomposed Relative Positional Embedding bias terms from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950

    Args:
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        (rel_h, rel_w): bias terms with shapes (B, q_h*q_w, k_h, 1) and
        (B, q_h*q_w, 1, k_w); their broadcast sum is the attention bias.
        (Unlike the upstream version, this does NOT add into an attn map.)
    """
    q_h, q_w = q_size
    k_h, k_w = k_size
    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
    Rw = get_rel_pos(q_w, k_w, rel_pos_w)
    B, _, dim = q.shape
    r_q = q.reshape(B, q_h, q_w, dim)
    # Per-query dot products with the relative embeddings over channels.
    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
    rel_h = rel_h.unsqueeze(-1)
    rel_w = rel_w.unsqueeze(-2)
    # Flatten the query grid: (B, q_h*q_w, k_h, 1) / (B, q_h*q_w, 1, k_w).
    rel_h = rel_h.reshape(B, q_h * q_w, k_h, 1)
    rel_w = rel_w.reshape(B, q_h * q_w, 1, k_w)
    return rel_h, rel_w
class PatchEmbed(nn.Module):
    """
    Image to Patch Embedding via a strided convolution.
    """

    def __init__(
        self,
        kernel_size: Tuple[int, int] = (16, 16),
        stride: Tuple[int, int] = (16, 16),
        padding: Tuple[int, int] = (0, 0),
        in_chans: int = 3,
        embed_dim: int = 768,
    ) -> None:
        """
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
        """
        super().__init__()
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Project patches, then move channels last: (B, C, H, W) -> (B, H, W, C).
        return self.proj(x).permute(0, 2, 3, 1)
def build_sam_vit_b(checkpoint=None):
    """Construct the ViT-B variant of the SAM image encoder.

    Args:
        checkpoint: optional path to a state_dict to load into the encoder.
    """
    vit_b_settings = dict(
        encoder_embed_dim=768,
        encoder_depth=12,
        encoder_num_heads=12,
        encoder_global_attn_indexes=[2, 5, 8, 11],
    )
    return _build_sam(checkpoint=checkpoint, **vit_b_settings)
def _build_sam(
    encoder_embed_dim,
    encoder_depth,
    encoder_num_heads,
    encoder_global_attn_indexes,
    checkpoint=None,
):
    """Assemble an ImageEncoderViT with SAM hyperparameters and optionally
    load checkpoint weights.

    Args:
        encoder_embed_dim: ViT embedding dimension.
        encoder_depth: number of transformer blocks.
        encoder_num_heads: attention heads per block.
        encoder_global_attn_indexes: block indices that use global attention.
        checkpoint: optional path to a torch-saved state_dict.

    Returns:
        The constructed ImageEncoderViT (on CPU).
    """
    prompt_embed_dim = 256
    image_size = 1024
    vit_patch_size = 16
    image_encoder = ImageEncoderViT(
        depth=encoder_depth,
        embed_dim=encoder_embed_dim,
        img_size=image_size,
        mlp_ratio=4,
        norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
        num_heads=encoder_num_heads,
        patch_size=vit_patch_size,
        qkv_bias=True,
        use_rel_pos=True,
        global_attn_indexes=encoder_global_attn_indexes,
        window_size=14,
        out_chans=prompt_embed_dim,
    )
    if checkpoint is not None:
        # map_location="cpu" so CUDA-saved checkpoints load on CPU-only hosts.
        state_dict = torch.load(checkpoint, map_location="cpu")
        # Keep only keys under the 'vision_tower_high' prefix and strip the
        # first 30 characters of each key name.
        # NOTE(review): the magic 30 encodes a specific key-prefix length in
        # the checkpoint layout — confirm it still matches before switching
        # checkpoints.
        image_encoder.load_state_dict(
            {k[30:]: v for k, v in state_dict.items() if 'vision_tower_high' in k},
            strict=True,
        )
        print(checkpoint)
    return image_encoder
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py
"""Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
import math
from collections.abc import Iterable, Mapping, Sequence
from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat
from transformers import BatchFeature
from vllm.config import VllmConfig
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
ImageSize, MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement,
PromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
MlpProjectorConfig,
VisionEncoderConfig)
from process.image_process import (
DeepseekOCR2Processor, count_tiles)
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
# from vllm.utils import is_list_of
from vllm.model_executor.models.interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
init_vllm_registered_model, maybe_prefix,
merge_multimodal_embeddings)
from deepencoderv2.sam_vary_sdpa import build_sam_vit_b
# from deepencoder.clip_sdpa import build_clip_l
from deepencoderv2.qwen2_d2e import build_qwen2_decoder_as_encoder
from deepencoderv2.build_linear import MlpProjector
from addict import Dict
# import time
from config import IMAGE_SIZE, BASE_SIZE, CROP_MODE, PRINT_NUM_VIS_TOKENS, PROMPT
# The image token id may be various
# Placeholder string that marks image positions in the prompt text.
_IMAGE_TOKEN = "<image>"
class DeepseekOCR2ProcessingInfo(BaseProcessingInfo):
    """Processing metadata for the DeepSeek-OCR2 multimodal model (vLLM)."""

    def get_hf_config(self):
        return self.ctx.get_hf_config(DeepseekVLV2Config)

    def get_hf_processor(self, **kwargs: object):
        return self.ctx.get_hf_processor(DeepseekOCR2Processor, **kwargs)

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        # No upper bound on the number of images per prompt.
        return {"image": None}

    def get_num_image_tokens(self,
                             *,
                             image_width: int,
                             image_height: int,
                             cropping: bool = True) -> int:
        """Number of placeholder tokens an image of the given size expands to.

        NOTE(review): the `cropping` parameter is ignored; the module-level
        CROP_MODE flag is consulted instead — confirm this is intended.
        """
        hf_processor = self.get_hf_processor()
        # image_size = hf_processor.image_size
        # patch_size = hf_processor.patch_size
        # downsample_ratio = hf_processor.downsample_ratio
        image_size = IMAGE_SIZE
        base_size = BASE_SIZE
        patch_size = 16
        downsample_ratio = 4
        if CROP_MODE:
            if image_width <= 768 and image_height <= 768:
                # Small images are never tiled.
                crop_ratio = [1, 1]
            else:
                # images_crop_raw, crop_ratio = hf_processor.dynamic_preprocess(image)
                # find the closest aspect ratio to the target
                crop_ratio = count_tiles(image_width, image_height, image_size=IMAGE_SIZE)
            num_width_tiles, num_height_tiles = crop_ratio
        else:
            num_width_tiles = num_height_tiles = 1
        # Token-grid side lengths for the global (base_size) view and for each
        # local (image_size) tile.
        h = w = math.ceil((base_size // patch_size) / downsample_ratio)
        h2 = w2 = math.ceil((image_size // patch_size) / downsample_ratio)
        global_views_tokens = h * (w)
        if num_width_tiles >1 or num_height_tiles>1:
            local_views_tokens = (num_height_tiles * h2) * (num_width_tiles * w2)
        else:
            local_views_tokens = 0
        # +1: presumably a separator token — confirm against the processor.
        return global_views_tokens + local_views_tokens + 1

    def get_image_size_with_most_features(self) -> ImageSize:
        # Worst-case image size used when profiling dummy inputs.
        if IMAGE_SIZE == 1024 and BASE_SIZE == 1280:
            return ImageSize(width=1024*2, height=1024*2)
        return ImageSize(width=768*2, height=768*2)
class DeepseekOCR2DummyInputsBuilder(
        BaseDummyInputsBuilder[DeepseekOCR2ProcessingInfo]):
    """Builds dummy text/image inputs that vLLM uses for memory profiling."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        # One image placeholder per requested dummy image.
        num_images = mm_counts.get("image", 0)
        processor = self.info.get_hf_processor()
        image_token = processor.image_token
        return image_token * num_images

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        """Return pre-tokenized dummy images at the most token-hungry size.

        Only provides image data when the configured PROMPT actually contains
        an image placeholder; otherwise profiles with text only.
        """
        num_images = mm_counts.get("image", 0)
        max_image_size = self.info.get_image_size_with_most_features()
        if '<image>' in PROMPT:
            return {
                "image":
                DeepseekOCR2Processor().tokenize_with_images(images = self._get_dummy_images(width=max_image_size.width,
                                               height=max_image_size.height,
                                               num_images=num_images), bos=True, eos=True, cropping=CROP_MODE)
            }
        else:
            return {
                "image": []
            }
class DeepseekOCR2MultiModalProcessor(
        BaseMultiModalProcessor[DeepseekOCR2ProcessingInfo]):
    """Turns raw prompt+image inputs into model tensors and expands the
    "<image>" placeholder into the correct number of image tokens."""

    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        """Delegate to the HF processor when images are present; otherwise
        fall back to plain text tokenization."""
        if mm_data:
            processed_outputs = self.info.ctx.call_hf_processor(
                self.info.get_hf_processor(**mm_kwargs),
                dict(prompt=prompt, **mm_data),
                mm_kwargs,
            )
        else:
            tokenizer = self.info.get_tokenizer()
            processed_outputs = tokenizer(prompt,
                                          add_special_tokens=True,
                                          return_tensors="pt")
        return processed_outputs

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        # pixel_values: global view; images_crop: local tiles;
        # images_spatial_crop: tile-grid shape per image.
        return dict(
            pixel_values=MultiModalFieldConfig.batched("image"),
            images_spatial_crop=MultiModalFieldConfig.batched("image"),
            # image_embeds=MultiModalFieldConfig.batched("image2"),
            images_crop=MultiModalFieldConfig.batched("image"),
        )

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        """Replace each single "<image>" token with N image tokens, where N is
        the per-image vision-token count."""
        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        image_token_id = hf_processor.image_token_id
        assert isinstance(image_token_id, int)

        def get_replacement_deepseek_vl2(item_idx: int):
            images = mm_items.get_items(
                "image", (ImageEmbeddingItems, ImageProcessorItems))
            if isinstance(images, ImageEmbeddingItems):
                num_image_tokens = images.get_feature_size(item_idx)
            else:
                # NOTE(review): assumes images[0][-1][0] carries (width, height)
                # of the pre-tokenized image — confirm against
                # DeepseekOCR2Processor.tokenize_with_images output format.
                width = images[0][-1][0][0]
                height = images[0][-1][0][1]
                num_image_tokens = self.info.get_num_image_tokens(
                    image_width=width,
                    image_height=height,
                    # flag = True,
                    cropping=CROP_MODE,
                )
            return [image_token_id] * num_image_tokens

        return [
            PromptReplacement(
                modality="image",
                target=[image_token_id],
                replacement=get_replacement_deepseek_vl2,
            )
        ]

    def _cached_apply_hf_processor(
        self,
        prompt: Union[str, list[int]],
        mm_data_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> tuple[list[int], MultiModalKwargs, bool]:
        # The processor logic is different for len(images) <= 2 vs > 2
        # Since the processing cache assumes that the processor output is
        # invariant of how many images are passed per prompt, we only
        # perform caching for the most common case
        if mm_data_items.get_count("image", strict=False) > 2:
            # This code path corresponds to the cache being disabled
            return self._apply_hf_processor_main(
                prompt=prompt,
                mm_items=mm_data_items,
                hf_processor_mm_kwargs=hf_processor_mm_kwargs,
                enable_hf_prompt_update=True,
            )
        return super()._cached_apply_hf_processor(
            prompt=prompt,
            mm_data_items=mm_data_items,
            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
        )
@MULTIMODAL_REGISTRY.register_processor(
    DeepseekOCR2MultiModalProcessor,
    info=DeepseekOCR2ProcessingInfo,
    dummy_inputs=DeepseekOCR2DummyInputsBuilder)
class DeepseekOCR2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
    """DeepSeek-OCR2 for vLLM.

    Vision path: SAM ViT-B encoder -> Qwen2 decoder used as an encoder ->
    linear projector into the LM embedding space; a DeepSeek language model
    consumes the merged text+vision embeddings.
    """

    # HF checkpoints prefix LM weights with "language."; vLLM expects
    # "language_model.".
    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
        "language.": "language_model.",
    })

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config: DeepseekVLV2Config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        multimodal_config = vllm_config.model_config.multimodal_config
        # config.model_type ='deepseek_vl_v2'
        self.config = config
        self.multimodal_config = multimodal_config
        self.vision_config = config.vision_config
        self.projector_config = config.projector_config
        self.text_config = config.text_config
        model_config = vllm_config.model_config
        tokenizer = cached_tokenizer_from_config(model_config)
        # Id of the "<image>" placeholder token used for embedding merge.
        self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
        # Vision tower components.
        self.sam_model = build_sam_vit_b()
        self.qwen2_model = build_qwen2_decoder_as_encoder()
        n_embed = 1280
        # Projects 896-dim encoder features to the 1280-dim LM embedding space.
        self.projector = MlpProjector(Dict(projector_type="linear", input_dim=896, n_embed=n_embed))
        self.tile_tag = config.tile_tag
        self.global_view_pos = config.global_view_pos
        # self.sam_model = torch.compile(self.sam_model, mode="reduce-overhead")
        # self.vision_model = torch.compile(self.vision_model, mode="reduce-overhead")
        # self.projector = torch.compile(self.projector, mode="max-autotune")
        # special token for image token sequence format
        embed_std = 1 / torch.sqrt(torch.tensor(n_embed, dtype=torch.float32))
        if self.tile_tag == "2D":
            # <|view_separator|>, <|\n|>
            # self.image_newline = nn.Parameter(torch.randn(n_embed) * embed_std)
            self.view_seperator = nn.Parameter(torch.randn(n_embed) * embed_std)
        else:
            raise ValueError(
                f"Only 2D tile_tag is supported currently, got: {self.tile_tag}"
            )
        # Select the matching DeepSeek LM architecture for the text backbone.
        if self.text_config.topk_method == "noaux_tc":
            architectures = ["DeepseekV3ForCausalLM"]
            # architectures = ["DeepseekForCausalLM"]
        elif not self.text_config.use_mla:
            architectures = ["DeepseekForCausalLM"]
        else:
            architectures = ["DeepseekV2ForCausalLM"]
        self.language_model = init_vllm_registered_model(
            vllm_config=vllm_config,
            hf_config=self.text_config,
            prefix=maybe_prefix(prefix, "language"),
            architectures=architectures,
        )
        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors)
        # Vision encoder runs in bf16 (inputs are cast to bf16 before encode).
        self.sam_model.to(dtype=torch.bfloat16)

    def _parse_and_validate_image_input(
            self, **kwargs: object):
        """Extract image tensors from kwargs.

        Returns None when there is no usable image data (an all-zero
        pixel_values tensor is treated as "no image"); otherwise returns
        [pixel_values, images_crop, images_spatial_crop].
        """
        pixel_values = kwargs.pop("pixel_values", None)
        images_spatial_crop = kwargs.pop("images_spatial_crop", None)
        images_crop = kwargs.pop("images_crop", None)
        if pixel_values is None or torch.sum(pixel_values).item() == 0:
            return None
        if pixel_values is not None:
            if not isinstance(pixel_values, (torch.Tensor, list)):
                raise ValueError("Incorrect type of pixel values. "
                                 f"Got type: {type(pixel_values)}")
            if not isinstance(images_spatial_crop, (torch.Tensor, list)):
                raise ValueError("Incorrect type of image sizes. "
                                 f"Got type: {type(images_spatial_crop)}")
            if not isinstance(images_crop, (torch.Tensor, list)):
                raise ValueError("Incorrect type of image crop. "
                                 f"Got type: {type(images_crop)}")
            return [pixel_values, images_crop, images_spatial_crop]
        raise AssertionError("This line should be unreachable.")

    def _pixel_values_to_embedding(
        self,
        pixel_values: torch.Tensor,
        images_crop: torch.Tensor,
        images_spatial_crop: torch.Tensor,
    ) -> NestedTensors:
        """Encode each image (global view + optional local tiles) into a flat
        sequence of LM-space embeddings, ending with the view separator."""
        # Pixel_values (global view): [n_image, batch_size, 3, height, width]
        # images_spatial_crop: [n_image, batch_size, [num_tiles_w, num_tiles_h]]
        # images_crop (local view): [n_image, batch_size, num_pathes, 3, h, w]
        # split the pixel and image_crop, all batch_size = 1
        images_in_this_batch = []
        with torch.no_grad():
            for jdx in range(images_spatial_crop.size(0)):
                # with torch.set_grad_enabled(False):
                patches = images_crop[jdx][0].to(torch.bfloat16) # batch_size = 1
                # patches = images_crop[jdx][0]
                image_ori = pixel_values[jdx]
                crop_shape = images_spatial_crop[jdx][0]
                if torch.sum(patches).item() != 0: # if all values = 0, no crop
                    # P, C, H, W = patches.shape
                    # crop_flag = 1
                    # Encode local tiles: SAM -> Qwen2-as-encoder -> projector.
                    local_features_1 = self.sam_model(patches)
                    #TODO del patches
                    # torch.compiler.cudagraph_mark_step_begin()
                    local_features_2 = self.qwen2_model(local_features_1)
                    # local_features = torch.cat((local_features_2[:, 1:], local_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
                    local_features = self.projector(local_features_2)
                    # Encode the global view through the same pipeline.
                    global_features_1 = self.sam_model(image_ori)
                    global_features_2 = self.qwen2_model(global_features_1)
                    # global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
                    global_features = self.projector(global_features_2)
                    if PRINT_NUM_VIS_TOKENS:
                        print('=====================')
                        print('BASE: ', global_features.shape)
                        print('PATCHES: ', local_features.shape)
                        print('=====================')
                    _, hw, n_dim = global_features.shape
                    # h = w = int(hw ** 0.5)
                    _2, hw2, n_dim2 = local_features.shape
                    # h2 = w2 = int(hw2 ** 0.5)
                    # width_crop_num, height_crop_num = crop_shape[0], crop_shape[1]
                    # global_features = global_features.view(h, w, n_dim)
                    # global_features = torch.cat(
                    #     [global_features, self.image_newline[None, None, :].expand(h, 1, n_dim)], dim=1
                    # )
                    # Flatten to [num_tokens, n_dim] and concatenate:
                    # local tiles first, then global view, then separator.
                    global_features = global_features.view(-1, n_dim)
                    # local_features = local_features.view(height_crop_num, width_crop_num, h2, w2, n_dim2).permute(0, 2, 1, 3, 4).reshape(height_crop_num*h2, width_crop_num*w2, n_dim2)
                    # local_features = torch.cat(
                    #     [local_features, self.image_newline[None, None, :].expand(height_crop_num * h2, 1, n_dim2)], dim=1
                    # )
                    local_features = local_features.view(-1, n_dim2)
                    global_local_features = torch.cat([local_features, global_features, self.view_seperator[None, :]], dim=0)
                else:
                    # No tiles: only the global view plus the separator.
                    global_features_1 = self.sam_model(image_ori)
                    global_features_2 = self.qwen2_model(global_features_1)
                    # global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
                    global_features = self.projector(global_features_2)
                    if PRINT_NUM_VIS_TOKENS:
                        print('=====================')
                        print('BASE: ', global_features.shape)
                        print('NO PATCHES')
                        print('=====================')
                    _, hw, n_dim = global_features.shape
                    # h = w = int(hw ** 0.5)
                    # global_features = global_features.view(h, w, n_dim)
                    # global_features = torch.cat(
                    #     [global_features, self.image_newline[None, None, :].expand(h, 1, n_dim)], dim=1
                    # )
                    global_features = global_features.view(-1, n_dim)
                    global_local_features = torch.cat([global_features, self.view_seperator[None, :]], dim=0)
                images_in_this_batch.append(global_local_features)
        return images_in_this_batch

    def _process_image_input(
            self, image_input) -> torch.Tensor:
        """Cast inputs to the dtypes the vision tower expects and encode."""
        # image_input: [pixel_values, images_crop, images_spatial_crop]
        pixel_values = image_input[0].to(torch.bfloat16)
        # images_crop = image_input[1].to(torch.bfloat16)
        images_crop = image_input[1]
        # images_crop = image_input[1]
        images_spatial_crop = image_input[2].to(dtype=torch.long)
        # local_start = time.time()
        vision_features = self._pixel_values_to_embedding(
            pixel_values=pixel_values, images_crop = images_crop, images_spatial_crop=images_spatial_crop)
        return vision_features

    def get_language_model(self) -> torch.nn.Module:
        return self.language_model

    def get_multimodal_embeddings(
            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
        """Encode any images in kwargs; None when there are no images."""
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
            return None
        vision_embeddings = self._process_image_input(image_input)
        return vision_embeddings

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
    ) -> torch.Tensor:
        """Embed text tokens and splice vision embeddings into the positions
        occupied by image placeholder tokens."""
        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
        # input_ids.to(torch.bfloat16)
        # self.image_token_id.to(torch.bfloat16)
        if multimodal_embeddings is not None:
            # multimodal_embeddings = multimodal_embeddings.to(torch.bfloat16)
            # multimodal_embeddings = [emb.to(torch.bfloat16) for emb in multimodal_embeddings]
            inputs_embeds = merge_multimodal_embeddings(
                input_ids, inputs_embeds, multimodal_embeddings,
                self.image_token_id)
        return inputs_embeds

    def forward(self,
                input_ids: torch.Tensor,
                positions: torch.Tensor,
                intermediate_tensors: Optional[IntermediateTensors] = None,
                inputs_embeds: Optional[torch.Tensor] = None,
                **kwargs: object):
        """Standard vLLM forward: build merged embeddings if needed, then run
        the language model."""
        if intermediate_tensors is not None:
            inputs_embeds = None
        # NOTE: In v1, inputs_embeds is always generated at model runner, this
        # condition is for v0 compatibility
        elif inputs_embeds is None:
            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
            inputs_embeds = self.get_input_embeddings(input_ids,
                                                      vision_embeddings)
            input_ids = None
        hidden_states = self.language_model(input_ids,
                                            positions,
                                            intermediate_tensors,
                                            inputs_embeds=inputs_embeds)
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
        """Route checkpoint weights: vision/projector names drop their leading
        'model.' prefix; everything else is prefixed 'language.' and then
        remapped to 'language_model.' by hf_to_vllm_mapper."""
        processed_weights = []
        for name, tensor in weights:
            if 'sam_model' in name or 'qwen2_model' in name or 'projector' in name or 'view_seperator' in name:
                new_name = name.replace('model.', '', 1)
            else:
                new_name = 'language.' + name
            # tensor = tensor.to(torch.bfloat16)
            processed_weights.append((new_name, tensor))
        loader = AutoWeightsLoader(self)
        autoloaded_weights = loader.load_weights(processed_weights, mapper=self.hf_to_vllm_mapper)
        return autoloaded_weights
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DeepSeek OCR API Server (vLLM) - 极简版 + 优化版
"""
import os
import io
import re
import argparse
import asyncio
from io import BytesIO
from typing import List
from concurrent.futures import ThreadPoolExecutor
import torch
from PIL import Image
try:
import fitz
except Exception:
fitz = None
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
from vllm import LLM, SamplingParams
from vllm.model_executor.models.registry import ModelRegistry
from deepseek_ocr2 import DeepseekOCR2ForCausalLM
from process.ngram_norepeat import NoRepeatNGramLogitsProcessor
from process.image_process import DeepseekOCR2Processor
app = FastAPI(title="DeepSeek OCR API (vLLM) - Optimized", version="2.0.0")
# CORS wide open: all origins, methods, and headers are allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Global vLLM engine; created by initialize_model() before serving.
llm = None
# CPU pool for decode/tokenize work; single-worker GPU pool serializes inference.
cpu_executor = ThreadPoolExecutor(max_workers=8, thread_name_prefix="CPU-Worker")
gpu_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="GPU-Worker")
# Guards the whole clear-cache + generate sequence against interleaving.
vllm_lock = asyncio.Lock()
PROMPT_OCR = "<image>\n<|grounding|>Convert the document to markdown."
PROMPT_DESC = "<image>\nDescribe this image in detail."
# -----------------------
# Monkey Patch
# -----------------------
# Allows a per-call `prompt` to reach the processor by temporarily swapping
# the module-global config.PROMPT that tokenize_with_images reads.
_original_tokenize = DeepseekOCR2Processor.tokenize_with_images
def _patched_tokenize(self, images, bos=True, eos=True, cropping=True, prompt=None):
    # NOTE(review): mutating config.PROMPT is process-global and NOT
    # thread-safe — concurrent tokenize calls with different prompts can race.
    if prompt is not None:
        import config
        old = config.PROMPT
        config.PROMPT = prompt
        try:
            return _original_tokenize(self, images, bos, eos, cropping)
        finally:
            # Always restore the previous prompt, even on failure.
            config.PROMPT = old
    return _original_tokenize(self, images, bos, eos, cropping)
DeepseekOCR2Processor.tokenize_with_images = _patched_tokenize
def pdf_to_images_sync(pdf_bytes: bytes, dpi: int = 144) -> List[Image.Image]:
    """Render each page of a PDF to an RGB PIL image (synchronous, CPU-bound).

    Args:
        pdf_bytes: Raw PDF file contents.
        dpi: Render resolution; PDF's native unit is 72 dpi, so the zoom
            matrix is dpi/72 per axis.

    Returns:
        One RGB image per page, in page order.

    Raises:
        RuntimeError: If PyMuPDF (fitz) is not installed.
    """
    if fitz is None:
        raise RuntimeError("Please install PyMuPDF")
    images = []
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        matrix = fitz.Matrix(dpi / 72.0, dpi / 72.0)
        for page in doc:
            pix = page.get_pixmap(matrix=matrix, alpha=False)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            if img.mode != "RGB":
                if img.mode in ("RGBA", "LA"):
                    # Composite transparency onto a white background.
                    bg = Image.new("RGB", img.size, (255, 255, 255))
                    bg.paste(img, mask=img.split()[-1])
                    img = bg
                else:
                    img = img.convert("RGB")
            images.append(img)
    finally:
        # Fix: close the document even when rendering a page raises,
        # so a malformed PDF cannot leak the open document.
        doc.close()
    return images
def image_open_sync(image_bytes: bytes) -> Image.Image:
    """Decode raw image bytes into an RGB PIL image (synchronous)."""
    buffer = BytesIO(image_bytes)
    return Image.open(buffer).convert("RGB")
def clear_vllm_cache_sync():
    """Best-effort clear of vLLM's multimodal processor cache (synchronous).

    Relies on private engine attributes and may break across vLLM versions,
    so any failure is deliberately ignored. No-op when the engine is not
    initialized yet.
    """
    if llm is None:
        return
    try:
        if hasattr(llm.llm_engine, 'input_preprocessor'):
            prep = llm.llm_engine.input_preprocessor
            if hasattr(prep, '_mm_processor_cache'):
                prep._mm_processor_cache.clear()
    except Exception:
        # Fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt;
        # narrow to Exception while keeping the best-effort behavior.
        pass
def tokenize_image_sync(image: Image.Image, prompt: str):
    """Preprocess one image into model inputs (synchronous, CPU-bound).

    Deliberately synchronous: callers dispatch this to the CPU thread pool,
    which is the main latency optimization of this server.
    """
    return DeepseekOCR2Processor().tokenize_with_images(images=[image], prompt=prompt)
def vllm_generate_sync(tokenized, prompt: str) -> str:
    """Run one vLLM generation (synchronous, GPU-bound).

    `tokenized` must already have been produced on a CPU worker thread.
    """
    batch_inputs = [{
        "prompt": prompt,
        "multi_modal_data": {"image": tokenized}
    }]
    # Shared sampling settings for both prompt types.
    common = dict(
        temperature=0.0,
        max_tokens=8192,
        skip_special_tokens=False,
        include_stop_str_in_output=True,
    )
    if prompt == PROMPT_OCR:
        # OCR decoding additionally suppresses n-gram repetition.
        anti_repeat = [NoRepeatNGramLogitsProcessor(ngram_size=20, window_size=50, whitelist_token_ids={128821, 128822})]
        params = SamplingParams(logits_processors=anti_repeat,
                                repetition_penalty=1.05,
                                **common)
    else:
        params = SamplingParams(**common)
    outputs = llm.generate(batch_inputs, params)
    return outputs[0].outputs[0].text
def clean_markdown_sync(text: str) -> str:
    """Strip DeepSeek-OCR markup tokens from model output (synchronous).

    Removes grounding tags, generic special tokens, bounding boxes, and long
    '=' separator banners, then collapses excess blank lines.
    """
    # Order matters: specific ref/det spans first, then any remaining tokens.
    for pattern, flags in (
        (r'<\|ref\|>.*?<\|/ref\|>', 0),
        (r'<\|det\|>.*?<\|/det\|>', 0),
        (r'<\|.*?\|>', 0),
        (r'\[\[.*?\]\]', 0),
        (r'={50,}.*?={50,}', re.DOTALL),
    ):
        text = re.sub(pattern, '', text, flags=flags)
    # Collapse runs of 3+ newlines down to a single blank line.
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
async def pdf_to_images_async(pdf_bytes: bytes, dpi: int = 144) -> List[Image.Image]:
    """Render a PDF to page images on the CPU worker pool (async wrapper)."""
    return await asyncio.get_event_loop().run_in_executor(
        cpu_executor, pdf_to_images_sync, pdf_bytes, dpi)
async def image_open_async(image_bytes: bytes) -> Image.Image:
    """Decode image bytes on the CPU worker pool (async wrapper)."""
    event_loop = asyncio.get_event_loop()
    return await event_loop.run_in_executor(cpu_executor, image_open_sync, image_bytes)
async def tokenize_image_async(image: Image.Image, prompt: str):
    """Run the heavy image preprocessing on the CPU pool so the event loop
    stays responsive (async wrapper around tokenize_image_sync)."""
    return await asyncio.get_event_loop().run_in_executor(
        cpu_executor, tokenize_image_sync, image, prompt)
async def vllm_generate_async(image: Image.Image, prompt: str) -> str:
    """
    Full vLLM inference pipeline (async).
    Optimization: tokenize (CPU pool) is separated from generate (GPU pool).
    """
    # Step 1: tokenize (CPU-bound, runs on the CPU thread pool).
    tokenized = await tokenize_image_async(image, prompt)
    # Step 2: GPU inference (GPU-bound, runs on the GPU pool under the lock).
    async with vllm_lock:
        # Clear the processor cache first (also on the GPU pool).
        loop = asyncio.get_event_loop()
        await loop.run_in_executor(gpu_executor, clear_vllm_cache_sync)
        # GPU inference.
        result = await loop.run_in_executor(
            gpu_executor,
            vllm_generate_sync,
            tokenized,
            prompt
        )
    return result
async def clean_markdown_async(text: str) -> str:
    """Run markdown cleanup on the CPU worker pool (async wrapper)."""
    return await asyncio.get_event_loop().run_in_executor(
        cpu_executor, clean_markdown_sync, text)
async def generate_image_description_async(image: Image.Image) -> str:
    """Describe an image with the VLM, then trim the text to <= 200 chars.

    Best-effort: returns an empty string on any failure so callers can fall
    back to a generic placeholder.
    """
    try:
        raw = await vllm_generate_async(image, PROMPT_DESC)

        def _summarize(text):
            # Drop model markup, then collapse all whitespace runs.
            for pattern in (r'<\|ref\|>.*?<\|/ref\|>',
                            r'<\|det\|>.*?<\|/det\|>',
                            r'<\|.*?\|>',
                            r'\[\[.*?\]\]'):
                text = re.sub(pattern, '', text)
            text = re.sub(r'\s+', ' ', text).strip()
            if len(text) <= 200:
                return text
            head = text[:200]
            stop = head.rfind('.')
            # Prefer ending at a sentence boundary; otherwise at a word break.
            if stop > 100:
                return text[:stop + 1]
            return head.rsplit(' ', 1)[0] + '...'

        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(cpu_executor, _summarize, raw)
    except Exception as e:
        print(f"WARNING: 图片描述失败: {e}")
        return ""
# -----------------------
# 模型初始化
# -----------------------
def initialize_model(model_path: str, gpu_id: int):
    """Register the custom architecture and build the global vLLM engine.

    Must be called once before serving requests. Restricts CUDA visibility
    to the requested GPU and pins the legacy v0 engine via VLLM_USE_V1=0.
    """
    global llm
    ModelRegistry.register_model("DeepseekOCR2ForCausalLM", DeepseekOCR2ForCausalLM)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    os.environ['VLLM_USE_V1'] = '0'  # force the legacy v0 engine
    print(f"[INFO] 加载模型: {model_path}")
    llm = LLM(
        model=model_path,
        hf_overrides={"architectures": ["DeepseekOCR2ForCausalLM"]},
        block_size=64,
        enforce_eager=False,
        trust_remote_code=True,
        max_model_len=8192,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.9,
        max_num_seqs=20,
        disable_mm_preprocessor_cache=True,
        swap_space=0,
    )
    print("[SUCCESS] 模型加载完成")
    #print(f"[INFO] 线程池配置:")
    #print(f" - CPU 线程池: {cpu_executor._max_workers} 线程")
    #print(f" - GPU 线程池: {gpu_executor._max_workers} 线程")
# -----------------------
# API 路由
# -----------------------
@app.get("/")
async def root():
    """Service banner endpoint."""
    banner = {
        "service": "DeepSeek OCR2 (vLLM) - Optimized",
        "version": "2.0.0",
        "status": "running",
    }
    return banner
@app.get("/health")
async def health():
    """Liveness probe reporting engine readiness and pool sizing."""
    report = {
        "status": "healthy",
        "model_ready": llm is not None,
        "cpu_workers": cpu_executor._max_workers,
        "gpu_workers": gpu_executor._max_workers,
    }
    return report
async def vllm_generate_batch_async(
    images: List[Image.Image],
    prompt: str,
    show_progress: bool = True
) -> List[str]:
    """Batched vLLM inference: concurrent CPU tokenize, one GPU generate call.

    Args:
        images: Pages to process.
        prompt: Prompt applied to every page.
        show_progress: Print per-stage progress messages.

    Returns:
        Generated text, one entry per input image (input order preserved).
    """
    total = len(images)
    # Stage 1: tokenize all pages concurrently on the CPU pool.
    if show_progress:
        print(f" [1/3] Tokenize {total} 页...")
    tokenize_tasks = [tokenize_image_async(img, prompt) for img in images]
    all_tokenized = await asyncio.gather(*tokenize_tasks)
    if show_progress:
        print(f" [1/3] Tokenize 完成")
    # Stage 2: assemble one batched request.
    batch_inputs = [
        {
            "prompt": prompt,
            "multi_modal_data": {"image": tok}
        }
        for tok in all_tokenized
    ]
    # Stage 3: single batched GPU inference, serialized by the engine lock.
    async with vllm_lock:
        if show_progress:
            print(f" [2/3] GPU 批量推理 {total} 页...")
        loop = asyncio.get_event_loop()
        await loop.run_in_executor(gpu_executor, clear_vllm_cache_sync)

        def batch_generate():
            # Sampling params mirror the single-request path (vllm_generate_sync).
            if prompt == PROMPT_OCR:
                logits_proc = [NoRepeatNGramLogitsProcessor(
                    ngram_size=20, window_size=50,
                    whitelist_token_ids={128821, 128822})]
                params = SamplingParams(
                    temperature=0.0,
                    max_tokens=8192,
                    skip_special_tokens=False,
                    logits_processors=logits_proc,
                    # Fix: the single-request path applies this penalty;
                    # keep batch and single decoding consistent.
                    repetition_penalty=1.05,
                    include_stop_str_in_output=True,
                )
            else:
                params = SamplingParams(
                    temperature=0.0,
                    max_tokens=8192,
                    skip_special_tokens=False,
                    include_stop_str_in_output=True,
                )
            # Key point: one batched generate call for the whole document.
            outputs = llm.generate(batch_inputs, params)
            return [out.outputs[0].text for out in outputs]

        results = await loop.run_in_executor(gpu_executor, batch_generate)
        if show_progress:
            print(f" [2/3] GPU 推理完成")
    return results
@app.post("/ocr")
async def ocr(
    file: UploadFile = File(...),
    enable_description: bool = Form(False),
):
    """OCR endpoint (batched): accept a PDF or image, return merged markdown.

    Args:
        file: Uploaded PDF or image.
        enable_description: When true, grounded image regions in the OCR
            output are replaced with VLM-generated captions.
    """
    if llm is None:
        raise HTTPException(503, "模型未加载")
    import time
    start_time = time.time()
    try:
        # 1. Read the upload.
        contents = await file.read()
        t1 = time.time()
        # 2. Decode: PDF -> one image per page; anything else -> single image.
        # Fix: UploadFile.filename may be None; guard before .lower().
        filename = (file.filename or "").lower()
        if filename.endswith('.pdf'):
            images = await pdf_to_images_async(contents)
        else:
            images = [await image_open_async(contents)]
        if not images:
            # Fix: a zero-page PDF previously crashed with ZeroDivisionError
            # at the per-page average below.
            raise HTTPException(400, "文件不包含任何页面")
        t2 = time.time()
        # 3. Batched OCR over all pages.
        raw_results = await vllm_generate_batch_async(images, PROMPT_OCR)
        t3 = time.time()
        print(f" OCR 耗时: {t3 - t2:.2f}s")
        # 4. Post-process pages concurrently.
        print(f" [3/3] 后处理...")

        async def postprocess(idx: int, raw: str, img: Image.Image) -> str:
            # Optionally replace grounded image regions with captions.
            if enable_description:
                img_pattern = r'<\|ref\|>image<\|/ref\|><\|det\|>\[\[.*?\]\]<\|/det\|>'
                matches = list(re.finditer(img_pattern, raw))
                for match in matches:
                    desc = await generate_image_description_async(img)
                    replacement = f"[图片: {desc}]" if desc else "[图片]"
                    raw = raw.replace(match.group(0), replacement)
            cleaned = await clean_markdown_async(raw)
            return cleaned if cleaned else ""

        tasks = [postprocess(i, raw, img) for i, (raw, img) in enumerate(zip(raw_results, images))]
        md_parts = await asyncio.gather(*tasks)
        t4 = time.time()
        print(f" [3/3] 后处理完成 ({t4 - t3:.2f}s)")
        # 5. Merge non-empty pages.
        final_md = "\n\n".join([md for md in md_parts if md])
        total_time = time.time() - start_time
        print(f"{'='*60}")
        print(f"[SUCCESS] 全部完成")
        print(f" 总耗时: {total_time:.2f}s")
        print(f" 平均: {total_time / len(images):.2f}s/页")
        print(f"{'='*60}\n")
        return JSONResponse({
            "markdown": final_md,
            "page_count": len(images),
            "processing_time": round(total_time, 2),
        })
    except HTTPException:
        # Fix: don't re-wrap deliberate HTTP errors (e.g. the 400 above) as 500s.
        raise
    except Exception as e:
        import traceback
        print(f"[ERROR] 处理失败: {e}")
        print(traceback.format_exc())
        raise HTTPException(500, f"处理失败: {e}")
# -----------------------
# 优雅关闭
# -----------------------
@app.on_event("shutdown")
async def shutdown_event():
    """Drain both worker pools when the server stops."""
    print("[INFO] 关闭线程池...")
    for pool in (cpu_executor, gpu_executor):
        pool.shutdown(wait=True)
    print("[SUCCESS] 线程池已关闭")
# -----------------------
# 启动
# -----------------------
def main():
    """Parse CLI arguments, load the OCR model, and launch the API server.

    Command-line options:
        --model-path   (required) filesystem path to the model weights
        --gpu-id       GPU device index to load the model on (default 0)
        --port         HTTP port to listen on (default 8001)
        --host         bind address (default 0.0.0.0)
        --cpu-workers  size of the CPU post-processing thread pool (default 2)
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", required=True, help="模型路径")
    parser.add_argument("--gpu-id", type=int, default=0, help="GPU ID")
    parser.add_argument("--port", type=int, default=8001, help="端口")
    parser.add_argument("--host", default="0.0.0.0", help="监听地址")
    parser.add_argument("--cpu-workers", type=int, default=2, help="CPU 线程池大小")
    args = parser.parse_args()

    # Rebuild the CPU thread pool with the requested size. Shut down the
    # module-level default pool first so its idle worker threads don't leak
    # (the old executor would otherwise be abandoned while still alive).
    global cpu_executor
    cpu_executor.shutdown(wait=False)
    cpu_executor = ThreadPoolExecutor(
        max_workers=args.cpu_workers,
        thread_name_prefix="CPU-Worker"
    )

    initialize_model(args.model_path, args.gpu_id)
    print(f"\n[INFO] 服务启动: http://{args.host}:{args.port}")
    print(f"[INFO] 接口文档: http://{args.host}:{args.port}/docs\n")
    # workers=1: the model is loaded in-process, so multiple uvicorn workers
    # would each try to hold their own copy on the GPU.
    uvicorn.run(app, host=args.host, port=args.port, workers=1)
# Script entry point: start the OCR API server with CLI-provided settings.
if __name__ == "__main__":
    main()
# DeepSeek-OCR: Contexts Optical Compression
text
Haoran Wei, Yaofeng Sun, Yukun Li
text
DeepSeek-AI
sub_title
## Abstract
We present DeepSeek-OCR as an initial investigation into the feasibility of compressing long contexts via optical 2D mapping. DeepSeek-OCR consists of two components: DeepEncoder and DeepSeek3B-MoE-A570M as the decoder. Specifically, DeepEncoder serves as the core engine, designed to maintain low activations under high-resolution input while achieving high compression ratios to ensure an optimal and manageable number of vision tokens. Experiments show that when the number of text tokens is within 10 times that of vision tokens (i.e., a compression ratio \( <10\times \) ), the model can achieve decoding (OCR) precision of 97%. Even at a compression ratio of 20×, the OCR accuracy still remains at about 60%. This shows considerable promise for research areas such as historical long-context compression and memory forgetting mechanisms in LLMs. Beyond this, DeepSeek-OCR also demonstrates high practical value. On OmniDocBench, it surpasses GOT-OCR2.0 (256 tokens/page) using only 100 vision tokens, and outperforms MinerU2.0 (6000+ tokens per page on average) while utilizing fewer than 800 vision tokens. In production, DeepSeek-OCR can generate training data for LLMs/VLMs at a scale of 200k+ pages per day (a single A100-40G). Codes and model weights are publicly accessible at http://github.com/deepseek-ai/DeepSeek-OCR.
Figure 1 | Figure (a) shows the compression ratio (number of text tokens in ground truth/number of vision tokens model used) testing on Fox \( [21] \) benchmark; Figure (b) shows performance comparisons on OmniDocBench \( [27] \) . DeepSeek-OCR can achieve state-of-the-art performance among end-to-end models enjoying the fewest vision tokens.<endofsentence>
## sub_title
## Contents
1 Introduction 3
2 Related Works 4
2.1 Typical Vision Encoders in VLMs 4
2.2 End-to-end OCR Models 4
3 Methodology 5
3.1 Architecture 5
3.2 DeepEncoder 5
3.2.1 Architecture of DeepEncoder 5
3.2.2 Multiple resolution support 6
3.3 The MoE Decoder 7
3.4 Data Engine 7
3.4.1 OCR 1.0 data 7
3.4.2 OCR 2.0 data 8
3.4.3 General vision data 9
3.4.4 Text-only data 9
3.5 Training Pipelines 9
3.5.1 Training DeepEncoder 10
3.5.2 Training DeepSeek-OCR 10
4 Evaluation 10
4.1 Vision-text Compression Study 10
4.2 OCR Practical Performance 12
4.3 Qualitative Study 12
4.3.1 Deep parsing 12
4.3.2 Multilingual recognition 16
4.3.3 General vision understanding 17
5 Discussion 18
6 Conclusion 19<endofsentence>
## 1. Introduction
Current Large Language Models (LLMs) face significant computational challenges when processing long textual content due to quadratic scaling with sequence length. We explore a potential solution: leveraging visual modality as an efficient compression medium for textual information. A single image containing document text can represent rich information using substantially fewer tokens than the equivalent digital text, suggesting that optical compression through vision tokens could achieve much higher compression ratios.
This insight motivates us to reexamine vision-language models (VLMs) from an LLM-centric perspective, focusing on how vision encoders can enhance LLMs' efficiency in processing textual information rather than basic VQA \( [12, 16, 24, 32, 41] \) tasks, at which humans already excel. OCR tasks, as an intermediate modality bridging vision and language, provide an ideal testbed for this vision-text compression paradigm, as they establish a natural compression-decompression mapping between visual and textual representations while offering quantitative evaluation metrics.
Accordingly, we present DeepSeek-OCR, a VLM designed as a preliminary proof-of-concept for efficient vision-text compression. Our work makes three primary contributions:
First, we provide comprehensive quantitative analysis of vision-text token compression ratios. Our method achieves 96%+ OCR decoding precision at 9-10× text compression, ~90% at 10-12× compression, and ~60% at 20× compression on Fox \( [21] \) benchmarks featuring diverse document layouts (with actual accuracy being even higher when accounting for formatting differences between output and ground truth), as shown in Figure 1(a). The results demonstrate that compact language models can effectively learn to decode compressed visual representations, suggesting that larger LLMs could readily acquire similar capabilities through appropriate pretraining design.
Second, we introduce DeepEncoder, a novel architecture that maintains low activation memory and minimal vision tokens even with high-resolution inputs. It serially connects window attention and global attention encoder components through a 16× convolutional compressor. This design ensures that the window attention component processes a large number of vision tokens, while the compressor reduces vision tokens before they enter the dense global attention component, achieving effective memory and token compression.
Third, we develop DeepSeek-OCR based on DeepEncoder and DeepSeek3B-MoE \( [19, 20] \) . As shown in Figure 1(b), it achieves state-of-the-art performance within end-to-end models on OmniDocBench while using the fewest vision tokens. Additionally, we equip the model with capabilities for parsing charts, chemical formulas, simple geometric figures, and natural images to enhance its practical utility further. In production, DeepSeek-OCR can generate 33 million pages of data per day for LLMs or VLMs using 20 nodes (each with 8 A100-40G GPUs).
In summary, this work presents a preliminary exploration of using visual modality as an efficient compression medium for textual information processing in LLMs. Through DeepSeek-OCR, we demonstrate that vision-text compression can achieve significant token reduction (7-20×) for different historical context stages, offering a promising direction for addressing long-context challenges in large language models. Our quantitative analysis provides empirical guidelines for VLM token allocation optimization, while the proposed DeepEncoder architecture showcases practical feasibility with real-world deployment capabilities. Although focused on OCR as a proof-of-concept, this paradigm opens new possibilities for rethinking how vision and language modalities can be synergistically combined to enhance computational efficiency in large-scale text processing and agent systems.<endofsentence>
Figure 2 | Typical vision encoders in popular VLMs. Here are three types of encoders commonly used in current open-source VLMs, all of which suffer from their respective deficiencies.
sub_title
## 2. Related Works
sub_title
2.1. Typical Vision Encoders in VLMs
Current open-source VLMs employ three main types of vision encoders, as illustrated in Figure 2. The first type is a dual-tower architecture represented by Vary [36], which utilizes parallel SAM [17] encoder to increase visual vocabulary parameters for high-resolution image processing. While offering controllable parameters and activation memory, this approach suffers from significant drawbacks: it requires dual image preprocessing that complicates deployment and makes encoder pipeline parallelism challenging during training. The second type is tile-based method exemplified by InternVL2.0 [8], which processes images by dividing them into small tiles for parallel computation, reducing activation memory under high-resolution settings. Although capable of handling extremely high resolutions, this approach has notable limitations due to its typically low native encoder resolution (below 512×512), causing large images to be excessively fragmented and resulting in numerous vision tokens. The third type is adaptive resolution encoding represented by Qwen2-VL [35], which adopts the NaViT [10] paradigm to directly process full images through patch-based segmentation without tile parallelization. While this encoder can handle diverse resolutions flexibly, it faces substantial challenges with large images due to massive activation memory consumption that can cause GPU memory overflow, and sequence packing requires extremely long sequence lengths during training. Long vision tokens will slow down both prefill and generation phases of inference.
## sub_title
2.2. End-to-end OCR Models
OCR, particularly the document parsing task, has been a highly active topic in the image-to-text domain. With the advancement of VLMs, a large number of end-to-end OCR models have emerged, fundamentally transforming the traditional pipeline architecture (which required separate detection and recognition expert models) by simplifying OCR systems. Nougat [6] first employs an end-to-end framework for academic paper OCR on arXiv, demonstrating the potential of models in handling dense perception tasks. GOT-OCR2.0 [38] expands the scope of OCR2.0 to include more synthetic image parsing tasks and designs an OCR model with performance-efficiency trade-offs, further highlighting the potential of end-to-end OCR research. Additionally, general vision models such as the Qwen-VL series [35], InternVL series [8], and many of their derivatives continuously enhance their document OCR capabilities to explore dense visual perception boundaries. However, a crucial research question that current models have not addressed is: for a document containing 1000 words, how many vision tokens are at least needed for decoding? This question holds significant importance for research in the principle that "a picture is worth a thousand words."<endofsentence>
Figure 3 | The architecture of DeepSeek-OCR. DeepSeek-OCR consists of a DeepEncoder and a DeepSeek-3B-MoE decoder. DeepEncoder is the core of DeepSeek-OCR, comprising three components: a SAM \( [17] \) for perception dominated by window attention, a CLIP \( [29] \) for knowledge with dense global attention, and a \( 16\times \) token compressor that bridges between them.
## sub_title
## 3. Methodology
## 3.1. Architecture
As shown in Figure 3, DeepSeek-OCR enjoys a unified end-to-end VLM architecture consisting of an encoder and a decoder. The encoder (namely DeepEncoder) is responsible for extracting image features and tokenizing as well as compressing visual representations. The decoder is used for generating the required result based on image tokens and prompts. DeepEncoder is approximately 380M in parameters, mainly composed of an 80M SAM-base [17] and a 300M CLIP-large [29] connected in series. The decoder adopts a 3B MoE [19, 20] architecture with 570M activated parameters. In the following paragraphs, we will delve into the model components, data engineering, and training skills.
## sub_title
## 3.2. DeepEncoder
To explore the feasibility of contexts optical compression, we need a vision encoder with the following features: 1. Capable of processing high resolutions; 2. Low activation at high resolutions; 3. Few vision tokens; 4. Support for multiple resolution inputs; 5. Moderate parameter count. However, as described in Section 2.1, current open-source encoders cannot fully satisfy all these conditions. Therefore, we design a novel vision encoder ourselves, named DeepEncoder.
## 3.2.1. Architecture of DeepEncoder
DeepEncoder mainly consists of two components: a visual perception feature extraction component dominated by window attention, and a visual knowledge feature extraction component with dense global attention. To benefit from the pretraining gains of previous works, we use SAM-base (patch-size 16) and CLIP-large as the main architectures for the two components respectively. For CLIP, we remove the first patch embedding layer since its input is no longer images but output tokens from the previous pipeline. Between the two components, we borrow from Vary [36] and use a 2-layer convolutional module to perform 16× downsampling of vision tokens. Each convolutional layer has a kernel size of 3, stride of 2, padding of 1, and channels increase from 256 to 1024. Assuming we input a 1024×1024 image, the DeepEncoder will segment it into 1024/16×1024/16=4096 patch tokens. Since the first half of encoder is dominated by window attention and only 80M, the activation is acceptable. Before entering global attention,<endofsentence>
Figure 4 | To test model performance under different compression ratios (requiring different numbers of vision tokens) and enhance the practicality of DeepSeek-OCR, we configure it with multiple resolution modes.
the 4096 tokens go through the compression module and the token count becomes 4096/16=256, thus making the overall activation memory controllable.
Table 1 | Multi resolution support of DeepEncoder. For both research and application purposes, we design DeepEncoder with diverse native resolution and dynamic resolution modes.
<table><tr><td rowspan="2">Mode</td><td colspan="4">Native Resolution</td><td colspan="2">Dynamic Resolution</td></tr><tr><td>Tiny</td><td>Small</td><td>Base</td><td>Large</td><td>Gundam</td><td>Gundam-M</td></tr><tr><td>Resolution</td><td>512</td><td>640</td><td>1024</td><td>1280</td><td>640+1024</td><td>1024+1280</td></tr><tr><td>Tokens</td><td>64</td><td>100</td><td>256</td><td>400</td><td>n×100+256</td><td>n×256+400</td></tr><tr><td>Process</td><td>resize</td><td>resize</td><td>padding</td><td>padding</td><td>resize + padding</td><td>resize + padding</td></tr></table>
## 3.2.2. Multiple resolution support
Suppose we have an image with 1000 optical characters and we want to test how many vision tokens are needed for decoding. This requires the model to support a variable number of vision tokens. That is to say the DeepEncoder needs to support multiple resolutions.
We meet the requirement aforementioned through dynamic interpolation of positional encodings, and design several resolution modes for simultaneous model training to achieve the capability of a single DeepSeek-OCR model supporting multiple resolutions. As shown in Figure 4, DeepEncoder mainly supports two major input modes: native resolution and dynamic resolution. Each of them contains multiple sub-modes.
Native resolution supports four sub-modes: Tiny, Small, Base, and Large, with corresponding resolutions and token counts of \( 512 \times 512 \) (64), \( 640 \times 640 \) (100), \( 1024 \times 1024 \) (256), and \( 1280 \times 1280 \) (400) respectively. Since Tiny and Small modes have relatively small resolutions, to avoid wasting vision tokens, images are processed by directly resizing the original shape. For Base and Large modes, in order to preserve the original image aspect ratio, images are padded to the corresponding size. After padding, the number of valid vision tokens is less than the actual number of vision tokens, with the calculation formula being:
\[ N_{\mathrm{valid}}=\left\lceil N_{\mathrm{actual}}\times\left[1-\frac{\max(w,h)-\min(w,h)}{\max(w,h)}\right]\right\rceil \quad (1) \]
where w and h represent the width and height of the original input image.<endofsentence>
Dynamic resolution can be composed of two native resolutions. For example, Gundam mode consists of \( n \times 640 \times 640 \) tiles (local views) and a \( 1024 \times 1024 \) global view. The tiling method follows InternVL2.0 [8]. Supporting dynamic resolution is mainly for application considerations, especially for ultra-high-resolution inputs (such as newspaper images). Tiling is a form of secondary window attention that can effectively reduce activation memory further. It's worth noting that due to our relatively large native resolutions, images won't be fragmented too much under dynamic resolution (the number of tiles is controlled within the range of 2 to 9). The vision token number output by the DeepEncoder under Gundam mode is: \( n \times 100 + 256 \) , where n is the number of tiles. For images with both width and height smaller than 640, n is set to 0, i.e., Gundam mode will degrade to Base mode.
Gundam mode is trained together with the four native resolution modes to achieve the goal of one model supporting multiple resolutions. Note that Gundam-master mode (1024×1024 local views+1280×1280 global view) is obtained through continued training on a trained DeepSeek-OCR model. This is mainly for load balancing, as Gundam-master's resolution is too large and training it together would slow down the overall training speed.
## 3.3. The MoE Decoder
Our decoder uses the DeepSeekMoE \( [19, 20] \) , specifically DeepSeek-3B-MoE. During inference, the model activates 6 out of 64 routed experts and 2 shared experts, with about 570M activated parameters. The 3B DeepSeekMoE is very suitable for domain-centric (OCR for us) VLM research, as it obtains the expressive capability of a 3B model while enjoying the inference efficiency of a 500M small model.
The decoder reconstructs the original text representation from the compressed latent vision tokens of DeepEncoder as:
\[ f_{\mathrm{dec}}:\mathbb{R}^{n\times d_{\mathrm{latent}}}\rightarrow\mathbb{R}^{N\times d_{\mathrm{text}}},\quad\hat{\mathbf{X}}=f_{\mathrm{dec}}(\mathbf{Z}),\quad \text{where } n\leq N \quad (2) \]
where \( Z \in R^{n \times d_{latent}} \) are the compressed latent (vision) tokens from DeepEncoder and \( \hat{X} \in R^{N \times d_{text}} \) is the reconstructed text representation. The function \( f_{dec} \) represents a non-linear mapping that can be effectively learned by compact language models through OCR-style training. It is reasonable to conjecture that LLMs, through specialized pretraining optimization, would demonstrate more natural integration of such capabilities.
## 3.4. Data Engine
We construct complex and diverse training data for DeepSeek-OCR, including OCR 1.0 data, which mainly consists of traditional OCR tasks such as scene image OCR and document OCR; OCR 2.0 data, which mainly includes parsing tasks for complex artificial images, such as common charts, chemical formulas, and plane geometry parsing data; General vision data, which is mainly used to inject certain general image understanding capabilities into DeepSeek-OCR and preserve the general vision interface.
## 3.4.1. OCR 1.0 data
Document data is the top priority for DeepSeek-OCR. We collect 30M pages of diverse PDF data covering about 100 languages from the Internet, with Chinese and English accounting for approximately 25M and other languages accounting for 5M. For this data, we create two types of ground truth: coarse annotations and fine annotations. Coarse annotations are extracted<endofsentence>
<ref>text</ref>[<ref>][<ref>][<ref>][<ref>][<ref>][<ref>]
Figure 5 | OCR 1.0 fine annotations display. We format the ground truth into an interleaved layout and text format, where each paragraph of text is preceded by the coordinates and label of it in the original image. All coordinates are normalized into 1000 bins.
directly from the full dataset using fitz, aimed at teaching the model to recognize optical text, especially in minority languages. Fine annotations include 2M pages each for Chinese and English, labeled using advanced layout models (such as PP-DocLayout [33]) and OCR models (such as MinerU [34] and GOT-OCR2.0 [38]) to construct detection and recognition interleaved data. For minority languages, in the detection part, we find that the layout model enjoys certain generalization capabilities. In the recognition part, we use fitz to create small patch data to train a GOT-OCR2.0, then use the trained model to label small patches after layout processing, employing a model flywheel to create 600K data samples. During the training of DeepSeek-OCR, coarse labels and fine labels are distinguished using different prompts. The ground truth for fine annotation image-text pairs can be seen in Figure 5. We also collect 3M Word data, constructing high-quality image-text pairs without layout by directly extracting content. This data mainly brings benefits to formulas and HTML-formatted tables. Additionally, we select some open-source data [28, 37] as supplements.
For natural scene OCR, our model mainly supports Chinese and English. The image data sources come from LAION [31] and Wukong [13], labeled using PaddleOCR [9], with 10M data samples each for Chinese and English. Like document OCR, natural scene OCR can also control whether to output detection boxes through prompts.
## 3.4.2. OCR 2.0 data
Following GOT-OCR2.0 [38], we refer to chart, chemical formula, and plane geometry parsing data as OCR 2.0 data. For chart data, following OneChart [7], we use pyecharts and matplotlib<endofsentence>
(a) Image-text ground truth of chart
(b) Image-text ground truth of geometry
Figure 6 | For charts, we do not use OneChart's [7] dictionary format, but instead use HTML table format as labels, which can save a certain amount of tokens. For plane geometry, we convert the ground truth to dictionary format, where the dictionary contains keys such as line segments, endpoint coordinates, line segment types, etc., for better readability. Each line segment is encoded using the Slow Perception [39] manner.
to render 10M images, mainly including commonly used line, bar, pie, and composite charts. We define chart parsing as image-to-HTML-table conversion task, as shown in Figure 6(a). For chemical formulas, we utilize SMILES format from PubChem as the data source and render them into images using RDKit, constructing 5M image-text pairs. For plane geometry images, we follow Slow Perception [39] for generation. Specifically, we use perception-ruler size as 4 to model each line segment. To increase the diversity of rendered data, we introduce geometric translation-invariant data augmentation, where the same geometric image is translated in the original image, corresponding to the same ground truth drawn at the centered position in the coordinate system. Based on this, we construct a total of 1M plane geometry parsing data, as illustrated in Figure 6(b).
## 3.4.3. General vision data
DeepEncoder can benefit from CLIP's pretraining gains and has sufficient parameters to incorporate general visual knowledge. Therefore, we also prepare some corresponding data for DeepSeek-OCR. Following DeepSeek-VL2 [40], we generate relevant data for tasks such as caption, detection, and grounding. Note that DeepSeek-OCR is not a general VLM model, and this portion of data accounts for only 20% of the total data. We introduce such type of data mainly to preserve the general vision interface, so that researchers interested in our model and general vision task can conveniently advance their work in the future.
## 3.4.4. Text-only data
To ensure the model’s language capabilities, we introduced 10% of in-house text-only pretrain data, with all data processed to a length of 8192 tokens, which is also the sequence length for DeepSeek-OCR. In summary, when training DeepSeek-OCR, OCR data accounts for 70%, general vision data accounts for 20%, and text-only data accounts for 10%.
## 3.5. Training Pipelines
Our training pipeline is very simple and consists mainly of two stages: a). Training DeepEncoder independently; b). Training the DeepSeek-OCR. Note that the Gundam-master mode is obtained by continuing training on a pre-trained DeepSeek-OCR model with 6M sampled data. Since the training protocol is identical to other modes, we omit the detailed description hereafter.<endofsentence>
## 3.5.1. Training DeepEncoder
Following Vary [36], we utilize a compact language model [15] and use the next token prediction framework to train DeepEncoder. In this stage, we use all OCR 1.0 and 2.0 data aforementioned, as well as 100M general data sampled from the LAION [31] dataset. All data is trained for 2 epochs with a batch size of 1280, using the AdamW [23] optimizer with cosine annealing scheduler [22] and a learning rate of \( 5 \times 10^{-5} \) . The training sequence length is 4096.
## 3.5.2. Training DeepSeek-OCR
After DeepEncoder is ready, we use data mentioned in Section 3.4 to train the DeepSeek-OCR with the entire training process conducted on the HAI-LLM \( [14] \) platform. The entire model uses pipeline parallelism (PP) and is divided into 4 parts, with DeepEncoder taking two parts and the decoder taking two parts. For DeepEncoder, we treat SAM and the compressor as the vision tokenizer, place them in PP0 and freeze their parameters, while treating the CLIP part as input embedding layer and place it in PP1 with unfrozen weights for training. For the language model part, since DeepSeek3B-MoE has 12 layers, we place 6 layers each on PP2 and PP3. We use 20 nodes (each with 8 A100-40G GPUs) for training, with a data parallelism (DP) of 40 and a global batch size of 640. We use the AdamW optimizer with a step-based scheduler and an initial learning rate of \( 3 \times 10^{-5} \) . For text-only data, the training speed is 90B tokens/day, while for multimodal data, the training speed is 70B tokens/day.
Table 2 | We test DeepSeek-OCR's vision-text compression ratio using all English documents with 600-1300 tokens from the Fox \( [21] \) benchmarks. Text tokens represent the number of tokens after tokenizing the ground truth text using DeepSeek-OCR's tokenizer. Vision Tokens=64 or 100 respectively represent the number of vision tokens output by DeepEncoder after resizing input images to \( 512 \times 512 \) and \( 640 \times 640 \) .
<table><tr><td rowspan="2">Text Tokens</td><td colspan="2">Vision Tokens = 64</td><td colspan="2">Vision Tokens = 100</td><td rowspan="2">Pages</td></tr><tr><td>Precision</td><td>Compression</td><td>Precision</td><td>Compression</td></tr><tr><td>600-700</td><td>96.5%</td><td>10.5×</td><td>98.5%</td><td>6.7×</td><td>7</td></tr><tr><td>700-800</td><td>93.8%</td><td>11.8×</td><td>97.3%</td><td>7.5×</td><td>28</td></tr><tr><td>800-900</td><td>83.8%</td><td>13.2×</td><td>96.8%</td><td>8.5×</td><td>28</td></tr><tr><td>900-1000</td><td>85.9%</td><td>15.1×</td><td>96.8%</td><td>9.7×</td><td>14</td></tr><tr><td>1000-1100</td><td>79.3%</td><td>16.5×</td><td>91.5%</td><td>10.6×</td><td>11</td></tr><tr><td>1100-1200</td><td>76.4%</td><td>17.7×</td><td>89.8%</td><td>11.3×</td><td>8</td></tr><tr><td>1200-1300</td><td>59.1%</td><td>19.7×</td><td>87.1%</td><td>12.6×</td><td>4</td></tr></table>
## 4. Evaluation
sub_title</td></tr></table>
## 4.1. Vision-text Compression Study
We select Fox \( [21] \) benchmarks to verify DeepSeek-OCR's compression-decompression capability for text-rich documents, in order to preliminarily explore the feasibility and boundaries of contexts optical compression. We use the English document portion of Fox, tokenize the ground truth text with DeepSeek-OCR's tokenizer (vocabulary size of approximately 129k), and select documents with 600-1300 tokens for testing, which happens to be 100 pages. Since the number of text tokens is not large, we only need to test performance in Tiny and Small modes, where Tiny mode corresponds to 64 tokens and Small mode corresponds to 100 tokens. We use the prompt<endofsentence>
<table><tr><td colspan="11">Table 3 | We use OmniDocBench [27] to test the performance of DeepSeek-OCR on real document parsing tasks. All metrics in the table are edit distances, where smaller values indicate better performance. &quot;Tokens&quot; represents the average number of vision tokens used per page, and &quot;†200dpi&quot; means using fitz to interpolate the original image to 200dpi. For the DeepSeek-OCR model, the values in parentheses in the &quot;Tokens&quot; column represent valid vision tokens, calculated according to Equation 1.</td></tr><tr><td colspan="11">table</td></tr><tr><td rowspan="2">Model</td><td rowspan="2">Tokens</td><td colspan="4">English</td><td colspan="5">Chinese</td></tr><tr><td colspan="4">overall text formula table order</td><td colspan="2">overall text formula table order</td><td colspan="3">overall text formula table order</td></tr><tr><td colspan="11">Pipeline Models</td></tr><tr><td>Dolphin [11]</td><td>-</td><td>0.356</td><td>0.352</td><td>0.465</td><td>0.258</td><td>0.35</td><td>0.44</td><td>0.44</td><td>0.604</td><td>0.367</td><td>0.351</td></tr><tr><td>Marker [1]</td><td>-</td><td>0.296</td><td>0.085</td><td>0.374</td><td>0.609</td><td>0.116</td><td>0.497</td><td>0.293</td><td>0.688</td><td>0.678</td><td>0.329</td></tr><tr><td>Mathpix [2]</td><td>-</td><td>0.191</td><td>0.105</td><td>0.306</td><td>0.243</td><td>0.108</td><td>0.364</td><td>0.381</td><td>0.454</td><td>0.32</td><td>0.30</td></tr><tr><td>MinerU-2.1.1 [34]</td><td>-</td><td>0.162</td><td>0.072</td><td>0.313</td><td>0.166</td><td>0.097</td><td>0.244</td><td>0.111</td><td>0.581</td><td>0.15</td><td>0.136</td></tr><tr><td>MonkeyOCR-1.2B [18]</td><td>-</td><td>0.154</td><td>0.062</td><td>0.295</td><td>0.164</td><td>0.094</td><td>0.263</td><td>0.179</td><td>0.464</td><td>0.168</td><td>0.243</td></tr><tr><td>PPstructure-v3 
[9]</td><td>-</td><td>0.152</td><td>0.073</td><td>0.295</td><td>0.162</td><td>0.077</td><td>0.223</td><td>0.136</td><td>0.535</td><td>0.111</td><td>0.11</td></tr></table>
<table><tr><td rowspan="2">Model</td><td rowspan="2">Tokens</td><td colspan="4">English</td><td colspan="5">Chinese</td></tr><tr><td colspan="4">overall text formula table order</td><td colspan="2">overall text formula table order</td><td colspan="3">overall text formula table order</td></tr><tr><td colspan="11">Pipeline Models</td></tr><tr><td>Dolphin [11]</td><td>-</td><td>0.356</td><td>0.352</td><td>0.465</td><td>0.258</td><td>0.35</td><td>0.44</td><td>0.44</td><td>0.604</td><td>0.367</td><td>0.351</td></tr><tr><td>Marker [1]</td><td>-</td><td>0.296</td><td>0.085</td><td>0.374</td><td>0.609</td><td>0.116</td><td>0.497</td><td>0.293</td><td>0.688</td><td>0.678</td><td>0.329</td></tr><tr><td>Mathpix [2]</td><td>-</td><td>0.191</td><td>0.105</td><td>0.306</td><td>0.243</td><td>0.108</td><td>0.364</td><td>0.381</td><td>0.454</td><td>0.32</td><td>0.30</td></tr><tr><td>MinerU-2.1.1 [34]</td><td>-</td><td>0.162</td><td>0.072</td><td>0.313</td><td>0.166</td><td>0.097</td><td>0.244</td><td>0.111</td><td>0.581</td><td>0.15</td><td>0.136</td></tr><tr><td>MonkeyOCR-1.2B [18]</td><td>-</td><td>0.154</td><td>0.062</td><td>0.295</td><td>0.164</td><td>0.094</td><td>0.263</td><td>0.179</td><td>0.464</td><td>0.168</td><td>0.243</td></tr><tr><td>PPstructure-v3 [9]</td><td>-</td><td>0.152</td><td>0.073</td><td>0.295</td><td>0.162</td><td>0.077</td><td>0.223</td><td>0.136</td><td>0.535</td><td>0.111</td><td>0.11</td></tr><tr><td colspan="11">End-to-end Models</td></tr><tr><td>Nougat [6]</td><td>2352</td><td>0.452</td><td>0.365</td><td>0.488</td><td>0.572</td><td>0.382</td><td>0.973</td><td>0.998</td><td>0.941</td><td>1.00</td><td>0.954</td></tr><tr><td>SmolDocling [25]</td><td>392</td><td>0.493</td><td>0.262</td><td>0.753</td><td>0.729</td><td>0.227</td><td>0.816</td><td>0.838</td><td>0.997</td><td>0.907</td><td>0.522</td></tr><tr><td>InternVL2-76B 
[8]</td><td>6790</td><td>0.44</td><td>0.353</td><td>0.543</td><td>0.547</td><td>0.317</td><td>0.443</td><td>0.29</td><td>0.701</td><td>0.555</td><td>0.228</td></tr><tr><td>Qwen2.5-VL-7B [5]</td><td>3949</td><td>0.316</td><td>0.151</td><td>0.376</td><td>0.598</td><td>0.138</td><td>0.399</td><td>0.243</td><td>0.5</td><td>0.627</td><td>0.226</td></tr><tr><td>OLMOCR [28]</td><td>3949</td><td>0.326</td><td>0.097</td><td>0.455</td><td>0.608</td><td>0.145</td><td>0.469</td><td>0.293</td><td>0.655</td><td>0.652</td><td>0.277</td></tr><tr><td>GOT-OCR2.0 [38]</td><td>256</td><td>0.287</td><td>0.189</td><td>0.360</td><td>0.459</td><td>0.141</td><td>0.411</td><td>0.315</td><td>0.528</td><td>0.52</td><td>0.28</td></tr><tr><td>OCRFlux-3B [3]</td><td>3949</td><td>0.238</td><td>0.112</td><td>0.447</td><td>0.269</td><td>0.126</td><td>0.349</td><td>0.256</td><td>0.716</td><td>0.162</td><td>0.263</td></tr><tr><td>GPT4o [26]</td><td>-</td><td>0.233</td><td>0.144</td><td>0.425</td><td>0.234</td><td>0.128</td><td>0.399</td><td>0.409</td><td>0.606</td><td>0.329</td><td>0.251</td></tr><tr><td>InternVL3-78B [42]</td><td>6790</td><td>0.218</td><td>0.117</td><td>0.38</td><td>0.279</td><td>0.095</td><td>0.296</td><td>0.21</td><td>0.533</td><td>0.282</td><td>0.161</td></tr><tr><td>Qwen2.5-VL-72B [5]</td><td>3949</td><td>0.214</td><td>0.092</td><td>0.315</td><td>0.341</td><td>0.106</td><td>0.261</td><td>0.18</td><td>0.434</td><td>0.262</td><td>0.168</td></tr><tr><td>dots.ocr [30]</td><td>3949</td><td>0.182</td><td>0.137</td><td>0.320</td><td>0.166</td><td>0.182</td><td>0.261</td><td>0.229</td><td>0.468</td><td>0.160</td><td>0.261</td></tr><tr><td>Gemini2.5-Pro [4]</td><td>-</td><td>0.148</td><td>0.055</td><td>0.356</td><td>0.13</td><td>0.049</td><td>0.212</td><td>0.168</td><td>0.439</td><td>0.119</td><td>0.121</td></tr><tr><td>MinerU2.0 
[34]</td><td>6790</td><td>0.133</td><td>0.045</td><td>0.273</td><td>0.15</td><td>0.066</td><td>0.238</td><td>0.115</td><td>0.506</td><td>0.209</td><td>0.122</td></tr><tr><td>dots.ocr†200dpi [30]</td><td>5545</td><td>0.125</td><td>0.032</td><td>0.329</td><td>0.099</td><td>0.04</td><td>0.16</td><td>0.066</td><td>0.416</td><td>0.092</td><td>0.067</td></tr><tr><td colspan="11">DeepSeek-OCR (end2end)</td></tr><tr><td>Tiny</td><td>64</td><td>0.386</td><td>0.373</td><td>0.469</td><td>0.422</td><td>0.283</td><td>0.361</td><td>0.307</td><td>0.635</td><td>0.266</td><td>0.236</td></tr><tr><td>Small</td><td>100</td><td>0.221</td><td>0.142</td><td>0.373</td><td>0.242</td><td>0.125</td><td>0.284</td><td>0.24</td><td>0.53</td><td>0.159</td><td>0.205</td></tr><tr><td>Base</td><td>256(182)</td><td>0.137</td><td>0.054</td><td>0.267</td><td>0.163</td><td>0.064</td><td>0.24</td><td>0.205</td><td>0.474</td><td>0.1</td><td>0.181</td></tr><tr><td>Large</td><td>400(285)</td><td>0.138</td><td>0.054</td><td>0.277</td><td>0.152</td><td>0.067</td><td>0.208</td><td>0.143</td><td>0.461</td><td>0.104</td><td>0.123</td></tr><tr><td>Gundam</td><td>795</td><td>0.127</td><td>0.043</td><td>0.269</td><td>0.134</td><td>0.062</td><td>0.181</td><td>0.097</td><td>0.432</td><td>0.089</td><td>0.103</td></tr><tr><td>Gundam-M†200dpi</td><td>1853</td><td>0.123</td><td>0.049</td><td>0.242</td><td>0.147</td><td>0.056</td><td>0.157</td><td>0.087</td><td>0.377</td><td>0.08</td><td>0.085</td></tr></table>
without layout: "<image>\nFree OCR." to control the model's output format. Nevertheless, the output format still cannot completely match Fox benchmarks, so the actual performance would be somewhat higher than the test results.
As shown in Table 2, within a \( 10\times \) compression ratio, the model's decoding precision can reach approximately 97%, which is a very promising result. In the future, it may be possible to achieve nearly \( 10\times \) lossless contexts compression through text-to-image approaches. When the compression ratio exceeds \( 10\times \) , performance begins to decline, which may have two reasons: one is that the layout of long documents becomes more complex, and another reason may be that long texts become blurred at \( 512\times512 \) or \( 640\times640 \) resolution. The first issue can be solved by rendering texts onto a single layout page, while we believe the second issue will become<endofsentence>
a feature of the forgetting mechanism. When compressing tokens by nearly 20×, we find that precision can still approach 60%. These results indicate that optical contexts compression is a very promising and worthwhile research direction, and this approach does not bring any overhead because it can leverage VLM infrastructure, as multimodal systems inherently require an additional vision encoder.
Table 4 | Edit distances for different categories of documents in OmniDocBench. The results show that some types of documents can achieve good performance with just 64 or 100 vision tokens, while others require Gundam mode.
<table><tr><td>Mode</td><td>Book</td><td>Slides</td><td>Financial Report</td><td>Textbook</td><td>Exam Paper</td><td>Magazine</td><td>Academic Papers</td><td>Notes</td><td>Newspaper</td><td>Overall</td></tr><tr><td>Tiny</td><td>0.147</td><td>0.116</td><td>0.207</td><td>0.173</td><td>0.294</td><td>0.201</td><td>0.395</td><td>0.297</td><td>0.94</td><td>0.32</td></tr><tr><td>Small</td><td>0.085</td><td>0.111</td><td>0.079</td><td>0.147</td><td>0.171</td><td>0.107</td><td>0.131</td><td>0.187</td><td>0.744</td><td>0.205</td></tr><tr><td>Base</td><td>0.037</td><td>0.08</td><td>0.027</td><td>0.1</td><td>0.13</td><td>0.073</td><td>0.052</td><td>0.176</td><td>0.645</td><td>0.156</td></tr><tr><td>Large</td><td>0.038</td><td>0.108</td><td>0.022</td><td>0.084</td><td>0.109</td><td>0.06</td><td>0.053</td><td>0.155</td><td>0.353</td><td>0.117</td></tr><tr><td>Gundam</td><td>0.035</td><td>0.085</td><td>0.289</td><td>0.095</td><td>0.094</td><td>0.059</td><td>0.039</td><td>0.153</td><td>0.122</td><td>0.083</td></tr><tr><td>Gundam-M</td><td>0.052</td><td>0.09</td><td>0.034</td><td>0.091</td><td>0.079</td><td>0.079</td><td>0.048</td><td>0.1</td><td>0.099</td><td>0.077</td></tr></table>
## 4.2 OCR Practical Performance
DeepSeek-OCR is not only an experimental model; it has strong practical capabilities and can construct data for LLM/VLM pretraining. To quantify OCR performance, we test DeepSeek-OCR on OmniDocBench \( [27] \) , with results shown in Table 3. Requiring only 100 vision tokens (640×640 resolution), DeepSeek-OCR surpasses GOT-OCR2.0 \( [38] \) which uses 256 tokens; with 400 tokens (285 valid tokens, 1280×1280 resolution), it achieves on-par performance with state-of-the-art models on this benchmark. Using fewer than 800 tokens (Gundam mode), DeepSeek-OCR outperforms MinerU2.0 \( [34] \) which needs nearly 7,000 vision tokens. These results demonstrate that our DeepSeek-OCR model is powerful in practical applications, and because of the higher token compression, it enjoys a higher research ceiling.
As shown in Table 4, some categories of documents require very few tokens to achieve satisfactory performance, such as slides which only need 64 vision tokens. For book and report documents, DeepSeek-OCR can achieve good performance with only 100 vision tokens. Combined with the analysis from Section 4.1, this may be because most text tokens in these document categories are within 1,000, meaning the vision-token compression ratio does not exceed 10×. For newspapers, Gundam or even Gundam-master mode is required to achieve acceptable edit distances, because the text tokens in newspapers are 4,000-5,000, far exceeding the 10× compression of other modes. These experimental results further demonstrate the boundaries of contexts optical compression, which may provide effective references for research on vision token optimization in VLMs as well as on context compression and forgetting mechanisms in LLMs.
## 4.3. Qualitative Study
## 4.3.1. Deep parsing
DeepSeek-OCR possesses both layout and OCR 2.0 capabilities, enabling it to further parse images within documents through secondary model calls, a feature we refer to as "deep parsing". As shown in Figures 7,8,9,10, our model can perform deep parsing on charts, geometry, chemical formulas, and even natural images, requiring only a unified prompt.<endofsentence>
Figure 7 | In the field of financial research reports, the deep parsing mode of DeepSeek-OCR can be used to obtain structured results of charts within documents. Charts are a crucial form of data representation in finance and scientific fields, and the chart structured extraction is an indispensable capability for future OCR models.<endofsentence>
image_caption
image_caption
<center>Figure 8 | For books and articles, the deep parsing mode can output dense captions for natural images in the documents. With just a prompt, the model can automatically identify what type of image it is and output the required results. </center><endofsentence>
Figure 9 | DeepSeek-OCR in deep parsing mode can also recognize chemical formulas within chemical documents and convert them to SMILES format. In the future, OCR 1.0+2.0 technology may play a significant role in the development of VLM/LLM in STEM fields.<endofsentence>
Figure 10 | DeepSeek-OCR also possesses the capability to copy (structure) simple planar geometric figures. Due to the intricate interdependencies among line segments in geometric shapes, parsing geometry task is extremely challenging and has a long way to go.
## 4.3.2. Multilingual recognition
PDF data on the Internet contains not only Chinese and English, but also a large amount of multilingual data, which is also crucial when training LLMs. For PDF documents, DeepSeek-OCR can handle nearly 100 languages. Like Chinese and English documents, multilingual data also supports both layout and non-layout OCR formats. The visualization results are shown in Figure 11, where we select Arabic and Sinhala languages to demonstrate results.<endofsentence>
Figure 11 | To endow the capability of processing widely crawled PDFs (multilingual data), we train our model with OCR capabilities for nearly 100 languages. Minority language documents can also support both layout and non-layout outputs through different prompts.
## 4.3.3. General vision understanding
We also provide DeepSeek-OCR with a certain degree of general image understanding capabilities. The related visualization results are shown in Figure 12.<endofsentence>
Figure 12 | We retain DeepSeek-OCR's capabilities in general visual understanding, mainly including image description, object detection, grounding, etc. Meanwhile, due to the inclusion of text-only data, DeepSeek-OCR's language capabilities are also retained. Note that since we do not include SFT (Supervised Fine-Tuning) stage, the model is not a chatbot, and some capabilities need completion prompts to be activated.
## 5. Discussion
Our work represents an initial exploration into the boundaries of vision-text compression, investigating how many vision tokens are required to decode N text tokens. The preliminary results are encouraging: DeepSeek-OCR achieves near-lossless OCR compression at approximately 10× ratios, while 20× compression still retains 60% accuracy. These findings suggest promising directions for future applications, such as implementing optical processing for dialogue histories beyond k rounds in multi-turn conversations to achieve 10× compression efficiency.<endofsentence>
Figure 13 | Forgetting mechanisms constitute one of the most fundamental characteristics of human memory. The contexts optical compression approach can simulate this mechanism by rendering previous rounds of historical text onto images for initial compression, then progressively resizing older images to achieve multi-level compression, where token counts gradually decrease and text becomes increasingly blurred, thereby accomplishing textual forgetting.
For older contexts, we could progressively downsize the rendered images to further reduce token consumption. This assumption draws inspiration from the natural parallel between human memory decay over time and visual perception degradation over spatial distance—both exhibit similar patterns of progressive information loss, as shown in Figure 13. By combining these mechanisms, the contexts optical compression method enables a form of memory decay that mirrors biological forgetting curves, where recent information maintains high fidelity while distant memories naturally fade through increased compression ratios.
While our initial exploration shows potential for scalable ultra-long context processing, where recent contexts preserve high resolution and older contexts consume fewer resources, we acknowledge this is early-stage work that requires further investigation. The approach suggests a path toward theoretically unlimited context architectures that balance information retention with computational constraints, though the practical implications and limitations of such vision-text compression systems warrant deeper study in future research.
## 6. Conclusion
In this technical report, we propose DeepSeek-OCR and preliminarily validate the feasibility of contexts optical compression through this model, demonstrating that the model can effectively decode text tokens exceeding 10 times the quantity from a small number of vision tokens. We believe this finding will facilitate the development of VLMs and LLMs in the future. Additionally, DeepSeek-OCR is a highly practical model capable of large-scale pretraining data production, serving as an indispensable assistant for LLMs. Of course, OCR alone is insufficient to fully validate true context optical compression and we will conduct digital-optical text interleaved pretraining, needle-in-a-haystack testing, and other evaluations in the future. From another perspective, optical contexts compression still offers substantial room for research and improvement, representing a promising new direction.<endofsentence>
## References
[1] Marker. URL https://github.com/datalab-to/marker.
[2] Mathpix. URL https://mathpix.com/.
[3] Ocrflux, 2025. URL https://github.com/chatdoc-com/OCRFlux.
[4] G. AI. Gemini 2.5-pro, 2025. URL https://gemini.google.com/.
[5] S. Bai, K. Chen, X. Liu, J. Wang, W. Ge, S. Song, K. Dang, P. Wang, S. Wang, J. Tang, H. Zhong, Y. Zhu, M. Yang, Z. Li, J. Wan, P. Wang, W. Ding, Z. Fu, Y. Xu, J. Ye, X. Zhang, T. Xie, Z. Cheng, H. Zhang, Z. Yang, H. Xu, and J. Lin. Qwen2.5-vl technical report. arXiv preprint arXiv:2502.13923, 2025.
[6] L. Blecher, G. Cucurull, T. Scialom, and R. Stojnic. Nougat: Neural optical understanding for academic documents. arXiv preprint arXiv:2308.13418, 2023.
[7] J. Chen, L. Kong, H. Wei, C. Liu, Z. Ge, L. Zhao, J. Sun, C. Han, and X. Zhang. Onechart: Purify the chart structural extraction via one auxiliary token. In Proceedings of the 32nd ACM International Conference on Multimedia, pages 147–155, 2024.
[8] Z. Chen, W. Wang, H. Tian, S. Ye, Z. Gao, E. Cui, W. Tong, K. Hu, J. Luo, Z. Ma, et al. How far are we to gpt-4v? closing the gap to commercial multimodal models with open-source suites. arXiv preprint arXiv:2404.16821, 2024.
[9] C. Cui, T. Sun, M. Lin, T. Gao, Y. Zhang, J. Liu, X. Wang, Z. Zhang, C. Zhou, H. Liu, et al. Paddleocr 3.0 technical report. arXiv preprint arXiv:2507.05595, 2025.
[10] M. Dehghani, J. Djolonga, B. Mustafa, P. Padlewski, J. Heek, J. Gilmer, A. Steiner, M. Caron, R. Geirhos, I. Alabdulmohsin, et al. Patch n' pack: Navit, a vision transformer for any aspect ratio and resolution. Advances in Neural Information Processing Systems, 36:3632–3656, 2023.
[11] H. Feng, S. Wei, X. Fei, W. Shi, Y. Han, L. Liao, J. Lu, B. Wu, Q. Liu, C. Lin, et al. Dolphin: Document image parsing via heterogeneous anchor prompting. arXiv preprint arXiv:2505.14059, 2025.
[12] Y. Goyal, T. Khot, D. Summers-Stay, D. Batra, and D. Parikh. Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 6904–6913, 2017.
[13] J. Gu, X. Meng, G. Lu, L. Hou, N. Minzhe, X. Liang, L. Yao, R. Huang, W. Zhang, X. Jiang, et al. Wukong: A 100 million large-scale chinese cross-modal pre-training benchmark. Advances in Neural Information Processing Systems, 35:26418–26431, 2022.
[14] High-flyer. HAI-LLM: Efficient and lightweight training tool for large models, 2023. URL https://www.high-flyer.cn/en/blog/hai-llm.
[15] S. Iyer, X. V. Lin, R. Pasunuru, T. Mihaylov, D. Simig, P. Yu, K. Shuster, T. Wang, Q. Liu, P. S. Koura, et al. Opt-iml: Scaling language model instruction meta learning through the lens of generalization. arXiv preprint arXiv:2212.12017, 2022.
[16] S. Kazemzadeh, V. Ordonez, M. Matten, and T. Berg. Referitgame: Referring to objects in photographs of natural scenes. In Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), pages 787–798, 2014.<endofsentence>
[17] A. Kirillov, E. Mintun, N. Ravi, H. Mao, C. Rolland, L. Gustafson, T. Xiao, S. Whitehead, A. C. Berg, W.-Y. Lo, et al. Segment anything. arXiv preprint arXiv:2304.02643, 2023.
[18] Z. Li, Y. Liu, Q. Liu, Z. Ma, Z. Zhang, S. Zhang, Z. Guo, J. Zhang, X. Wang, and X. Bai. Monkeyocr: Document parsing with a structure-recognition-relation triplet paradigm. arXiv preprint arXiv:2506.05218, 2025.
[19] A. Liu, B. Feng, B. Wang, B. Wang, B. Liu, C. Zhao, C. Dengr, C. Ruan, D. Dai, D. Guo, et al. Deepseek-v2: A strong, economical, and efficient mixture-of-experts language model. arXiv preprint arXiv:2405.04434, 2024.
[20] A. Liu, B. Feng, B. Xue, B. Wang, B. Wu, C. Lu, C. Zhao, C. Deng, C. Zhang, C. Ruan, et al. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437, 2024.
[21] C. Liu, H. Wei, J. Chen, L. Kong, Z. Ge, Z. Zhu, L. Zhao, J. Sun, C. Han, and X. Zhang. Focus anywhere for fine-grained multi-page document understanding. arXiv preprint arXiv:2405.14295, 2024.
[22] I. Loshchilov and F. Hutter. Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983, 2016.
[23] I. Loshchilov and F. Hutter. Decoupled weight decay regularization. In ICLR, 2019.
[24] A. Masry, D. X. Long, J. Q. Tan, S. Joty, and E. Hoque. Chartqa: A benchmark for question answering about charts with visual and logical reasoning. arXiv preprint arXiv:2203.10244, 2022.
[25] A. Nassar, A. Marafioti, M. Omenetti, M. Lysak, N. Livathinos, C. Auer, L. Morin, R. T. de Lima, Y. Kim, A. S. Gurbuz, et al. Smoldocling: An ultra-compact vision-language model for end-to-end multi-modal document conversion. arXiv preprint arXiv:2503.11576, 2025.
[26] OpenAI. Gpt-4 technical report, 2023
[27] L. Ouyang, Y. Qu, H. Zhou, J. Zhu, R. Zhang, Q. Lin, B. Wang, Z. Zhao, M. Jiang, X. Zhao, et al. Omnidocbench: Benchmarking diverse pdf document parsing with comprehensive annotations. In Proceedings of the Computer Vision and Pattern Recognition Conference, pages 24838–24848, 2025.
[28] J. Poznanski, A. Rangapur, J. Borchardt, J. Dunkelberger, R. Huff, D. Lin, C. Wilhelm, K. Lo, and L. Soldaini. olmocr: Unlocking trillions of tokens in pdfs with vision language models. arXiv preprint arXiv:2502.18443, 2025.
[29] A. Radford, J. W. Kim, C. Hallacy, A. Ramesh, G. Goh, S. Agarwal, G. Sastry, A. Askell, P. Mishkin, J. Clark, et al. Learning transferable visual models from natural language supervision. In International conference on machine learning, pages 8748–8763. PMLR, 2021.
[30] Rednote. dots.ocr, 2025. URL https://github.com/rednote-hilab/dots.ocr.
[31] C. Schuhmann, R. Vencu, R. Beaumont, R. Kaczmarczyk, C. Mullis, A. Katta, T. Coombes, J. Jitsev, and A. Komatsuzaki. Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114, 2021.<endofsentence>
[32] A. Singh, V. Natarajan, M. Shah, Y. Jiang, X. Chen, D. Batra, D. Parikh, and M. Rohrbach. Towards vqa models that can read. In Proceedings of the IEEE/CVF conference on computer vision and pattern recognition, pages 8317–8326, 2019.
[33] T. Sun, C. Cui, Y. Du, and Y. Liu. Pp-doclayout: A unified document layout detection model to accelerate large-scale data construction. arXiv preprint arXiv:2503.17213, 2025.
[34] B. Wang, C. Xu, X. Zhao, L. Ouyang, F. Wu, Z. Zhao, R. Xu, K. Liu, Y. Qu, F. Shang, et al. Mineru: An open-source solution for precise document content extraction. arXiv preprint arXiv:2409.18839, 2024.
[35] P. Wang, S. Bai, S. Tan, S. Wang, Z. Fan, J. Bai, K. Chen, X. Liu, J. Wang, W. Ge, et al. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191, 2024.
[36] H. Wei, L. Kong, J. Chen, L. Zhao, Z. Ge, J. Yang, J. Sun, C. Han, and X. Zhang. Vary: Scaling up the vision vocabulary for large vision-language model. In European Conference on Computer Vision, pages 408–424. Springer, 2024.
[37] H. Wei, L. Kong, J. Chen, L. Zhao, Z. Ge, E. Yu, J. Sun, C. Han, and X. Zhang. Small language model meets with reinforced vision vocabulary. arXiv preprint arXiv:2401.12503, 2024.
[38] H. Wei, C. Liu, J. Chen, J. Wang, L. Kong, Y. Xu, Z. Ge, L. Zhao, J. Sun, Y. Peng, et al. General ocr theory: Towards ocr-2.0 via a unified end-to-end model. arXiv preprint arXiv:2409.01704, 2024.
[39] H. Wei, Y. Yin, Y. Li, J. Wang, L. Zhao, J. Sun, Z. Ge, X. Zhang, and D. Jiang. Slow perception: Let's perceive geometric figures step-by-step. arXiv preprint arXiv:2412.20631, 2024.
[40] Z. Wu, X. Chen, Z. Pan, X. Liu, W. Liu, D. Dai, H. Gao, Y. Ma, C. Wu, B. Wang, et al. Deepseek-vl2: Mixture-of-experts vision-language models for advanced multimodal understanding. arXiv preprint arXiv:2412.10302, 2024.
[41] W. Yu, Z. Yang, L. Li, J. Wang, K. Lin, Z. Liu, X. Wang, and L. Wang. Mm-vet: Evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490, 2023.
[42] J. Zhu, W. Wang, Z. Chen, Z. Liu, S. Ye, L. Gu, H. Tian, Y. Duan, W. Su, J. Shao, et al. InternVL3: Exploring advanced training and test-time recipes for open-source multimodal models. arXiv preprint arXiv:2504.10479, 2025.<endofsentence>
\ No newline at end of file
INFO 02-27 11:51:47 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2_server.py:476: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] 加载模型: /home/lst/deepseek_ocr2
INFO 02-27 11:51:53 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCR2ForCausalLM']}
INFO 02-27 11:51:53 [config.py:721] This model supports multiple tasks: {'score', 'reward', 'classify', 'generate', 'embed'}. Defaulting to 'generate'.
INFO 02-27 11:51:53 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr2', speculative_config=None, tokenizer='/home/lst/deepseek_ocr2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr2, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[24,16,8,4,2,1],"max_capture_size":24}, use_cached_outputs=False,
INFO 02-27 11:51:53 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-27 11:51:53 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-27 11:51:53 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-27 11:51:53 [worker_base.py:653] ########## 4675 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}
INFO 02-27 11:51:53 [worker_base.py:654] ########## 4675 process(rank0) is running on memnode(s): {0, 1, 2, 3, 4, 5, 6, 7}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0227 11:51:54.129271 4675 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0227 11:51:54.129348 4675 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0227 11:51:54.129817 4675 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x563cdcfcebb0, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0227 11:51:54.129829 4675 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0227 11:51:54.150034 4675 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x563cdcfcebb0, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0227 11:51:54.150072 4675 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0227 11:51:54.151399 4675 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x563cdcfcebb0, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0227 11:51:54.151417 4675 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0227 11:51:54.152455 4675 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x563cdcfcebb0, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0227 11:51:54.152474 4675 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-27 11:51:54 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-27 11:51:54 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr2...
Using the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends.
INFO 02-27 11:51:55 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24] is overridden by config [1, 2, 4, 8, 16, 24]
Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 3.78it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 3.78it/s]
INFO 02-27 11:51:58 [loader.py:460] Loading weights took 2.26 seconds
INFO 02-27 11:51:58 [model_runner.py:1165] Model loading took 6.3336 GiB and 3.979427 seconds
Some kwargs in processor config are unused and will not have any effect: add_special_token, ignore_id, candidate_resolutions, image_mean, image_token, sft_format, normalize, mask_prompt, image_std, downsample_ratio, patch_size, pad_token.
/home/lst/DeepSeek-OCR2-vllm/deepencoderv2/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-27 11:52:11 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-27 11:52:12 [worker.py:287] Memory profiling takes 13.74 seconds
INFO 02-27 11:52:12 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.90) = 57.59GiB
INFO 02-27 11:52:12 [worker.py:287] model weights take 6.33GiB; non_torch_memory takes 1.58GiB; PyTorch activation peak memory takes 2.00GiB; the rest of the memory reserved for KV Cache is 47.67GiB.
INFO 02-27 11:52:12 [executor_base.py:112] # rocm blocks: 13017, # CPU blocks: 0
INFO 02-27 11:52:12 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 101.70x
INFO 02-27 11:52:12 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 0%| | 0/6 [00:00<?, ?it/s] Capturing CUDA graph shapes: 17%|█▋ | 1/6 [00:00<00:02, 1.84it/s] Capturing CUDA graph shapes: 33%|███▎ | 2/6 [00:01<00:02, 1.95it/s] Capturing CUDA graph shapes: 50%|█████ | 3/6 [00:01<00:01, 1.99it/s] Capturing CUDA graph shapes: 67%|██████▋ | 4/6 [00:02<00:01, 1.97it/s] Capturing CUDA graph shapes: 83%|████████▎ | 5/6 [00:02<00:00, 1.97it/s] Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:03<00:00, 1.97it/s] Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:03<00:00, 1.96it/s]
INFO 02-27 11:52:15 [model_runner.py:1752] Graph capturing finished in 3 secs, took 0.12 GiB
INFO 02-27 11:52:15 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 17.43 seconds
[SUCCESS] 模型加载完成
[INFO] 服务启动: http://0.0.0.0:8000
[INFO] 接口文档: http://0.0.0.0:8000/docs
INFO: Started server process [4675]
INFO: Waiting for application startup.
INFO: Application startup complete.
ERROR: [Errno 98] error while attempting to bind on address ('0.0.0.0', 8000): address already in use
INFO: Waiting for application shutdown.
INFO: Application shutdown complete.
[INFO] 关闭线程池...
[SUCCESS] 线程池已关闭
INFO 02-04 17:53:14 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2_server.py:476: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] 加载模型: /home/lst/deepseek_ocr2
INFO 02-04 17:53:19 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCR2ForCausalLM']}
INFO 02-04 17:53:19 [config.py:721] This model supports multiple tasks: {'score', 'embed', 'reward', 'classify', 'generate'}. Defaulting to 'generate'.
INFO 02-04 17:53:19 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr2', speculative_config=None, tokenizer='/home/lst/deepseek_ocr2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr2, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[24,16,8,4,2,1],"max_capture_size":24}, use_cached_outputs=False,
INFO 02-04 17:53:19 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 17:53:19 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 17:53:19 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 17:53:19 [worker_base.py:653] ########## 46947 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 17:53:19 [worker_base.py:654] ########## 46947 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 17:53:20.008334 46947 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 17:53:20.008416 46947 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:53:20.008852 46947 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55ed036ae440, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 17:53:20.008864 46947 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:53:20.028966 46947 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55ed036ae440, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 17:53:20.029006 46947 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:53:20.030189 46947 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55ed036ae440, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 17:53:20.030205 46947 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:53:20.031195 46947 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55ed036ae440, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 17:53:20.031214 46947 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 17:53:20 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 17:53:20 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr2...
Using the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends.
INFO 02-04 17:53:21 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24] is overridden by config [1, 2, 4, 8, 16, 24]
Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 7.17it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 7.15it/s]
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2_server.py", line 513, in <module>
[rank0]: main()
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2_server.py", line 504, in main
[rank0]: initialize_model(args.model_path, args.gpu_id)
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2_server.py", line 272, in initialize_model
[rank0]: llm = LLM(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/utils.py", line 1182, in inner
[rank0]: return fn(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py", line 255, in __init__
[rank0]: self.llm_engine = LLMEngine.from_engine_args(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 520, in from_engine_args
[rank0]: return engine_cls.from_vllm_config(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 496, in from_vllm_config
[rank0]: return cls(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 283, in __init__
[rank0]: self.model_executor = executor_class(vllm_config=vllm_config)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/executor_base.py", line 52, in __init__
[rank0]: self._init_executor()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/uniproc_executor.py", line 47, in _init_executor
[rank0]: self.collective_rpc("load_model")
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
[rank0]: answer = run_method(self.driver_worker, method, args, kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/utils.py", line 2624, in run_method
[rank0]: return func(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 203, in load_model
[rank0]: self.model_runner.load_model()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 1136, in load_model
[rank0]: self.model = get_model(vllm_config=self.vllm_config)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/__init__.py", line 14, in get_model
[rank0]: return loader.load_model(vllm_config=vllm_config)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/loader.py", line 457, in load_model
[rank0]: loaded_weights = model.load_weights(
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2.py", line 576, in load_weights
[rank0]: autoloaded_weights = loader.load_weights(processed_weights, mapper=self.hf_to_vllm_mapper)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/utils.py", line 261, in load_weights
[rank0]: autoloaded_weights = set(self._load_module("", self.module, weights))
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/utils.py", line 222, in _load_module
[rank0]: yield from self._load_module(prefix,
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/utils.py", line 222, in _load_module
[rank0]: yield from self._load_module(prefix,
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/utils.py", line 231, in _load_module
[rank0]: yield from self._load_param(prefix, child_params[child_prefix],
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/utils.py", line 154, in _load_param
[rank0]: weight_loader(param, weight_data)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/weight_utils.py", line 602, in default_weight_loader
[rank0]: assert param.size() == loaded_weight.size(), (
[rank0]: AssertionError: Attempted to load weight (torch.Size([1280, 896])) into parameter (torch.Size([1280, 2048]))
INFO 02-04 17:55:15 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2_server.py:476: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] 加载模型: /home/lst/deepseek_ocr2
INFO 02-04 17:55:20 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCR2ForCausalLM']}
INFO 02-04 17:55:20 [config.py:721] This model supports multiple tasks: {'reward', 'score', 'classify', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 02-04 17:55:20 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr2', speculative_config=None, tokenizer='/home/lst/deepseek_ocr2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr2, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[24,16,8,4,2,1],"max_capture_size":24}, use_cached_outputs=False,
INFO 02-04 17:55:21 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 17:55:21 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 17:55:21 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 17:55:21 [worker_base.py:653] ########## 47119 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 17:55:21 [worker_base.py:654] ########## 47119 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 17:55:21.263231 47119 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 17:55:21.263304 47119 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:55:21.263877 47119 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55627668c630, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 17:55:21.263891 47119 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:55:21.283950 47119 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55627668c630, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 17:55:21.283991 47119 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:55:21.285181 47119 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55627668c630, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 17:55:21.285199 47119 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:55:21.286090 47119 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55627668c630, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 17:55:21.286103 47119 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 17:55:21 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 17:55:21 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr2...
Using the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends.
INFO 02-04 17:55:22 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24] is overridden by config [1, 2, 4, 8, 16, 24]
Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 7.25it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 7.23it/s]
INFO 02-04 17:55:24 [loader.py:460] Loading weights took 1.77 seconds
INFO 02-04 17:55:24 [model_runner.py:1165] Model loading took 6.3336 GiB and 3.418305 seconds
Some kwargs in processor config are unused and will not have any effect: downsample_ratio, image_std, pad_token, image_token, image_mean, mask_prompt, sft_format, add_special_token, candidate_resolutions, ignore_id, patch_size, normalize.
/home/lst/DeepSeek-OCR2-vllm/deepencoderv2/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 17:55:36 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 17:55:37 [worker.py:287] Memory profiling takes 12.06 seconds
INFO 02-04 17:55:37 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.90) = 57.59GiB
INFO 02-04 17:55:37 [worker.py:287] model weights take 6.33GiB; non_torch_memory takes 1.58GiB; PyTorch activation peak memory takes 2.00GiB; the rest of the memory reserved for KV Cache is 47.67GiB.
INFO 02-04 17:55:37 [executor_base.py:112] # rocm blocks: 13017, # CPU blocks: 0
INFO 02-04 17:55:37 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 101.70x
INFO 02-04 17:55:37 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 0%| | 0/6 [00:00<?, ?it/s] Capturing CUDA graph shapes: 17%|█▋ | 1/6 [00:00<00:02, 1.91it/s] Capturing CUDA graph shapes: 33%|███▎ | 2/6 [00:01<00:01, 2.02it/s] Capturing CUDA graph shapes: 50%|█████ | 3/6 [00:01<00:01, 2.05it/s] Capturing CUDA graph shapes: 67%|██████▋ | 4/6 [00:01<00:00, 2.03it/s] Capturing CUDA graph shapes: 83%|████████▎ | 5/6 [00:02<00:00, 2.03it/s] Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:02<00:00, 2.02it/s] Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:02<00:00, 2.02it/s]
INFO 02-04 17:55:40 [model_runner.py:1752] Graph capturing finished in 3 secs, took 0.12 GiB
INFO 02-04 17:55:40 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 15.64 seconds
[SUCCESS] 模型加载完成
[INFO] 线程池配置:
- CPU 线程池: 2 线程
- GPU 线程池: 1 线程
[INFO] 服务启动: http://0.0.0.0:8707
[INFO] 接口文档: http://0.0.0.0:8707/docs
INFO: Started server process [47119]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: downsample_ratio, image_std, pad_token, image_token, image_mean, mask_prompt, sft_format, add_special_token, candidate_resolutions, ignore_id, patch_size, normalize.
[1/3] Tokenize 22 页...
[1/3] Tokenize 完成
[2/3] GPU 批量推理 22 页...
Processed prompts: 0%| | 0/22 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] Processed prompts: 5%|▍ | 1/22 [00:15<05:22, 15.35s/it, est. speed input: 73.68 toks/s, output: 6.45 toks/s] Processed prompts: 9%|▉ | 2/22 [00:15<02:13, 6.70s/it, est. speed input: 141.49 toks/s, output: 12.45 toks/s] Processed prompts: 14%|█▎ | 3/22 [00:17<01:23, 4.40s/it, est. speed input: 192.12 toks/s, output: 18.69 toks/s] Processed prompts: 18%|█▊ | 4/22 [00:18<00:55, 3.10s/it, est. speed input: 240.95 toks/s, output: 26.42 toks/s] Processed prompts: 23%|██▎ | 5/22 [00:20<00:44, 2.64s/it, est. speed input: 274.60 toks/s, output: 34.96 toks/s] Processed prompts: 27%|██▋ | 6/22 [00:21<00:34, 2.15s/it, est. speed input: 311.54 toks/s, output: 45.13 toks/s] Processed prompts: 32%|███▏ | 7/22 [00:23<00:28, 1.92s/it, est. speed input: 340.96 toks/s, output: 55.86 toks/s] Processed prompts: 36%|███▋ | 8/22 [00:28<00:43, 3.13s/it, est. speed input: 312.39 toks/s, output: 62.66 toks/s] Processed prompts: 41%|████ | 9/22 [00:30<00:32, 2.50s/it, est. speed input: 338.66 toks/s, output: 78.92 toks/s] Processed prompts: 45%|████▌ | 10/22 [00:31<00:24, 2.04s/it, est. speed input: 363.92 toks/s, output: 95.47 toks/s] Processed prompts: 50%|█████ | 11/22 [00:34<00:26, 2.42s/it, est. speed input: 362.15 toks/s, output: 107.33 toks/s] Processed prompts: 55%|█████▍ | 12/22 [00:36<00:23, 2.31s/it, est. speed input: 372.60 toks/s, output: 123.24 toks/s] Processed prompts: 59%|█████▉ | 13/22 [00:36<00:14, 1.66s/it, est. speed input: 401.77 toks/s, output: 144.77 toks/s] Processed prompts: 64%|██████▎ | 14/22 [00:36<00:09, 1.22s/it, est. speed input: 430.54 toks/s, output: 166.27 toks/s] Processed prompts: 68%|██████▊ | 15/22 [00:37<00:07, 1.11s/it, est. speed input: 450.82 toks/s, output: 185.24 toks/s] Processed prompts: 73%|███████▎ | 16/22 [00:41<00:11, 1.86s/it, est. 
speed input: 438.97 toks/s, output: 193.63 toks/s] Processed prompts: 77%|███████▋ | 17/22 [00:42<00:08, 1.71s/it, est. speed input: 451.25 toks/s, output: 210.43 toks/s] Processed prompts: 82%|████████▏ | 18/22 [00:43<00:06, 1.61s/it, est. speed input: 463.05 toks/s, output: 230.25 toks/s] Processed prompts: 86%|████████▋ | 19/22 [00:44<00:03, 1.24s/it, est. speed input: 484.52 toks/s, output: 254.90 toks/s] Processed prompts: 91%|█████████ | 20/22 [00:46<00:03, 1.60s/it, est. speed input: 483.26 toks/s, output: 270.36 toks/s] Processed prompts: 95%|█████████▌| 21/22 [00:46<00:01, 1.16s/it, est. speed input: 506.22 toks/s, output: 296.62 toks/s] Processed prompts: 100%|██████████| 22/22 [00:57<00:00, 3.91s/it, est. speed input: 434.70 toks/s, output: 290.10 toks/s] Processed prompts: 100%|██████████| 22/22 [00:57<00:00, 2.60s/it, est. speed input: 434.70 toks/s, output: 290.10 toks/s]
[2/3] GPU 推理完成
OCR 耗时: 63.74s
[3/3] 后处理...
[3/3] 后处理完成 (0.01s)
============================================================
[SUCCESS] 全部完成
总耗时: 75.22s
平均: 3.42s/页
============================================================
INFO: 127.0.0.1:41774 - "POST /ocr HTTP/1.1" 200 OK
[1/3] Tokenize 19 页...
[1/3] Tokenize 完成
[2/3] GPU 批量推理 19 页...
Processed prompts: 0%| | 0/19 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s] Processed prompts: 5%|▌ | 1/19 [00:11<03:34, 11.93s/it, est. speed input: 94.81 toks/s, output: 1.51 toks/s] Processed prompts: 11%|█ | 2/19 [00:14<01:53, 6.68s/it, est. speed input: 151.38 toks/s, output: 9.10 toks/s] Processed prompts: 16%|█▌ | 3/19 [00:15<01:02, 3.91s/it, est. speed input: 218.05 toks/s, output: 17.61 toks/s] Processed prompts: 21%|██ | 4/19 [00:15<00:37, 2.51s/it, est. speed input: 284.02 toks/s, output: 26.68 toks/s] Processed prompts: 26%|██▋ | 5/19 [00:16<00:25, 1.82s/it, est. speed input: 342.40 toks/s, output: 36.15 toks/s] Processed prompts: 37%|███▋ | 7/19 [00:16<00:11, 1.04it/s, est. speed input: 471.66 toks/s, output: 56.78 toks/s] Processed prompts: 53%|█████▎ | 10/19 [00:17<00:04, 2.00it/s, est. speed input: 665.14 toks/s, output: 88.74 toks/s] Processed prompts: 58%|█████▊ | 11/19 [00:17<00:03, 2.29it/s, est. speed input: 724.11 toks/s, output: 99.41 toks/s] Processed prompts: 63%|██████▎ | 12/19 [00:17<00:03, 2.19it/s, est. speed input: 766.24 toks/s, output: 109.08 toks/s] Processed prompts: 68%|██████▊ | 13/19 [00:17<00:02, 2.56it/s, est. speed input: 821.68 toks/s, output: 120.99 toks/s] Processed prompts: 79%|███████▉ | 15/19 [00:18<00:01, 3.52it/s, est. speed input: 934.66 toks/s, output: 145.94 toks/s] Processed prompts: 84%|████████▍ | 16/19 [00:18<00:00, 3.72it/s, est. speed input: 985.25 toks/s, output: 158.44 toks/s] Processed prompts: 89%|████████▉ | 17/19 [00:19<00:00, 2.11it/s, est. speed input: 986.96 toks/s, output: 166.73 toks/s] Processed prompts: 95%|█████████▍| 18/19 [00:19<00:00, 2.08it/s, est. speed input: 1018.80 toks/s, output: 181.96 toks/s] Processed prompts: 100%|██████████| 19/19 [00:21<00:00, 1.18it/s, est. speed input: 985.26 toks/s, output: 195.36 toks/s] Processed prompts: 100%|██████████| 19/19 [00:21<00:00, 1.15s/it, est. speed input: 985.26 toks/s, output: 195.36 toks/s]
[2/3] GPU 推理完成
OCR 耗时: 25.01s
[3/3] 后处理...
[3/3] 后处理完成 (0.00s)
============================================================
[SUCCESS] 全部完成
总耗时: 27.43s
平均: 1.44s/页
============================================================
INFO: 127.0.0.1:36528 - "POST /ocr HTTP/1.1" 200 OK
INFO 02-04 18:28:05 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2_server.py:476: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] 加载模型: /home/lst/deepseek_ocr2
INFO 02-04 18:28:10 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCR2ForCausalLM']}
INFO 02-04 18:28:10 [config.py:721] This model supports multiple tasks: {'score', 'classify', 'reward', 'generate', 'embed'}. Defaulting to 'generate'.
INFO 02-04 18:28:10 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr2', speculative_config=None, tokenizer='/home/lst/deepseek_ocr2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr2, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[24,16,8,4,2,1],"max_capture_size":24}, use_cached_outputs=False,
INFO 02-04 18:28:11 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 18:28:11 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 18:28:11 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 18:28:11 [worker_base.py:653] ########## 48110 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 18:28:11 [worker_base.py:654] ########## 48110 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 18:28:11.724944 48110 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 18:28:11.725028 48110 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 18:28:11.725626 48110 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55d2880bb140, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 18:28:11.725644 48110 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 18:28:11.744952 48110 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55d2880bb140, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 18:28:11.744990 48110 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 18:28:11.746210 48110 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55d2880bb140, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 18:28:11.746228 48110 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 18:28:11.747244 48110 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55d2880bb140, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 18:28:11.747262 48110 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 18:28:11 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 18:28:11 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr2...
Using the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends.
INFO 02-04 18:28:12 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24] is overridden by config [1, 2, 4, 8, 16, 24]
Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 7.25it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 7.23it/s]
INFO 02-04 18:28:15 [loader.py:460] Loading weights took 1.72 seconds
INFO 02-04 18:28:15 [model_runner.py:1165] Model loading took 6.3336 GiB and 3.415053 seconds
Some kwargs in processor config are unused and will not have any effect: image_token, add_special_token, image_std, pad_token, normalize, downsample_ratio, ignore_id, image_mean, sft_format, patch_size, mask_prompt, candidate_resolutions.
/home/lst/DeepSeek-OCR2-vllm/deepencoderv2/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2_server.py", line 513, in <module>
[rank0]: main()
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2_server.py", line 504, in main
[rank0]: initialize_model(args.model_path, args.gpu_id)
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2_server.py", line 272, in initialize_model
[rank0]: llm = LLM(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/utils.py", line 1182, in inner
[rank0]: return fn(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py", line 255, in __init__
[rank0]: self.llm_engine = LLMEngine.from_engine_args(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 520, in from_engine_args
[rank0]: return engine_cls.from_vllm_config(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 496, in from_vllm_config
[rank0]: return cls(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 286, in __init__
[rank0]: self._initialize_kv_caches()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 432, in _initialize_kv_caches
[rank0]: self.model_executor.determine_num_available_blocks())
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/executor_base.py", line 103, in determine_num_available_blocks
[rank0]: results = self.collective_rpc("determine_num_available_blocks")
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
[rank0]: answer = run_method(self.driver_worker, method, args, kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/utils.py", line 2624, in run_method
[rank0]: return func(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank0]: return func(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 249, in determine_num_available_blocks
[rank0]: self.model_runner.profile_run()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank0]: return func(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 1262, in profile_run
[rank0]: self._dummy_run(max_num_batched_tokens, max_num_seqs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 1388, in _dummy_run
[rank0]: self.execute_model(model_input, kv_caches, intermediate_tensors)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank0]: return func(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 1948, in execute_model
[rank0]: hidden_or_intermediate_states = model_executable(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2.py", line 542, in forward
[rank0]: inputs_embeds = self.get_input_embeddings(input_ids,
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2.py", line 521, in get_input_embeddings
[rank0]: inputs_embeds = merge_multimodal_embeddings(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/utils.py", line 481, in merge_multimodal_embeddings
[rank0]: return _merge_multimodal_embeddings(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/utils.py", line 397, in _merge_multimodal_embeddings
[rank0]: raise ValueError(
[rank0]: ValueError: Attempted to assign 833 + 833 + 833 + 833 + 833 + 833 + 833 + 833 + 833 = 7497 multimodal tokens to 7857 placeholders
INFO 02-04 18:29:58 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2_server.py:476: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] 加载模型: /home/lst/deepseek_ocr2
INFO 02-04 18:30:03 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCR2ForCausalLM']}
INFO 02-04 18:30:03 [config.py:721] This model supports multiple tasks: {'score', 'reward', 'classify', 'generate', 'embed'}. Defaulting to 'generate'.
INFO 02-04 18:30:03 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr2', speculative_config=None, tokenizer='/home/lst/deepseek_ocr2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr2, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[24,16,8,4,2,1],"max_capture_size":24}, use_cached_outputs=False,
INFO 02-04 18:30:03 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 18:30:03 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 18:30:03 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 18:30:03 [worker_base.py:653] ########## 48974 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 18:30:03 [worker_base.py:654] ########## 48974 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 18:30:03.930656 48974 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 18:30:03.930721 48974 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 18:30:03.931188 48974 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x555a7cdbbf30, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 18:30:03.931205 48974 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 18:30:03.950903 48974 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x555a7cdbbf30, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 18:30:03.950938 48974 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 18:30:03.952322 48974 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x555a7cdbbf30, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 18:30:03.952347 48974 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 18:30:03.953454 48974 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x555a7cdbbf30, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 18:30:03.953472 48974 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 18:30:03 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 18:30:03 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr2...
Using the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends.
INFO 02-04 18:30:04 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24] is overridden by config [1, 2, 4, 8, 16, 24]
Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 6.92it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 6.91it/s]
INFO 02-04 18:30:07 [loader.py:460] Loading weights took 1.83 seconds
INFO 02-04 18:30:07 [model_runner.py:1165] Model loading took 6.3336 GiB and 3.525082 seconds
Some kwargs in processor config are unused and will not have any effect: ignore_id, add_special_token, patch_size, mask_prompt, downsample_ratio, image_std, image_token, pad_token, sft_format, candidate_resolutions, normalize, image_mean.
/home/lst/DeepSeek-OCR2-vllm/deepencoderv2/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2_server.py", line 513, in <module>
[rank0]: main()
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2_server.py", line 504, in main
[rank0]: initialize_model(args.model_path, args.gpu_id)
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2_server.py", line 272, in initialize_model
[rank0]: llm = LLM(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/utils.py", line 1182, in inner
[rank0]: return fn(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py", line 255, in __init__
[rank0]: self.llm_engine = LLMEngine.from_engine_args(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 520, in from_engine_args
[rank0]: return engine_cls.from_vllm_config(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 496, in from_vllm_config
[rank0]: return cls(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 286, in __init__
[rank0]: self._initialize_kv_caches()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 432, in _initialize_kv_caches
[rank0]: self.model_executor.determine_num_available_blocks())
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/executor_base.py", line 103, in determine_num_available_blocks
[rank0]: results = self.collective_rpc("determine_num_available_blocks")
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
[rank0]: answer = run_method(self.driver_worker, method, args, kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/utils.py", line 2624, in run_method
[rank0]: return func(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank0]: return func(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 249, in determine_num_available_blocks
[rank0]: self.model_runner.profile_run()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank0]: return func(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 1262, in profile_run
[rank0]: self._dummy_run(max_num_batched_tokens, max_num_seqs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 1388, in _dummy_run
[rank0]: self.execute_model(model_input, kv_caches, intermediate_tensors)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank0]: return func(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 1948, in execute_model
[rank0]: hidden_or_intermediate_states = model_executable(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2.py", line 541, in forward
[rank0]: vision_embeddings = self.get_multimodal_embeddings(**kwargs)
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2.py", line 501, in get_multimodal_embeddings
[rank0]: vision_embeddings = self._process_image_input(image_input)
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2.py", line 487, in _process_image_input
[rank0]: vision_features = self._pixel_values_to_embedding(
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepseek_ocr2.py", line 401, in _pixel_values_to_embedding
[rank0]: local_features_2 = self.qwen2_model(local_features_1)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/home/lst/DeepSeek-OCR2-vllm/deepencoderv2/qwen2_d2e.py", line 264, in forward
[rank0]: batch_query_imgs = param_img.unsqueeze(0).expand(
[rank0]: UnboundLocalError: local variable 'param_img' referenced before assignment
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment