on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
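
The deprecation warning above comes from FastAPI's legacy `@app.on_event` hooks. A minimal migration sketch to the lifespan pattern the warning points at (the handler bodies are placeholders, not this service's actual startup code):

```python
from contextlib import asynccontextmanager

from fastapi import FastAPI


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup: load the model here (placeholder for this service's loader).
    yield
    # Shutdown: release resources previously handled by @app.on_event("shutdown").


app = FastAPI(lifespan=lifespan)
```

Code that runs before `yield` replaces `@app.on_event("startup")`; code after it replaces `@app.on_event("shutdown")`.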
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 11:46:40 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 11:46:40 [config.py:721] This model supports multiple tasks: {'generate', 'embed', 'score', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 02-04 11:46:45 [loader.py:460] Loading weights took 2.10 seconds
INFO 02-04 11:46:45 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.525227 seconds
Some kwargs in processor config are unused and will not have any effect: candidate_resolutions, downsample_ratio, patch_size, ignore_id, mask_prompt, normalize, pad_token, add_special_token, image_std, image_mean, sft_format, image_token.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 11:46:58 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 11:46:58 [worker.py:287] Memory profiling takes 12.97 seconds
INFO 02-04 11:46:58 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.50) = 31.99GiB
INFO 02-04 11:46:58 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.45GiB; the rest of the memory reserved for KV Cache is 22.76GiB.
INFO 02-04 11:46:58 [executor_base.py:112] # rocm blocks: 1553, # CPU blocks: 273
INFO 02-04 11:46:58 [executor_base.py:117] Maximum concurrency for 3281 tokens per request: 121.17x
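
The memory-profiling lines above decompose the GPU budget, and the arithmetic can be checked directly (all figures copied from this run, which used `gpu_memory_utilization=0.50`):

```python
# Figures copied from the worker log lines above (first run).
total_gpu_memory = 63.98       # GiB reported by the worker
gpu_memory_utilization = 0.50
budget = total_gpu_memory * gpu_memory_utilization   # 31.99 GiB usable

model_weights = 6.23           # GiB
non_torch_memory = 1.55        # GiB
activation_peak = 1.45         # GiB
# Whatever is left of the budget is reserved for the KV cache.
kv_cache = budget - model_weights - non_torch_memory - activation_peak

print(f"{budget:.2f} GiB budget, {kv_cache:.2f} GiB left for KV cache")
```

This reproduces the logged 31.99 GiB budget and 22.76 GiB KV-cache reservation.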
INFO 02-04 11:47:01 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:03<00:00, 1.53it/s]
INFO 02-04 11:47:04 [model_runner.py:1752] Graph capturing finished in 4 secs, took 0.12 GiB
INFO 02-04 11:47:04 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 19.41 seconds
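
If cudagraph capture ever fails with an out-of-memory error here, the warning above suggests eager mode; in vLLM's offline API that is a constructor flag. A sketch only (model path taken from this log, other arguments left at their defaults):

```python
from vllm import LLM

# Fall back to eager execution instead of CUDA-graph capture.
llm = LLM(model="/home/lst/deepseek_ocr", enforce_eager=True)
```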
[SUCCESS] Model loading complete
[INFO] Thread pool configuration:
- CPU thread pool: 1 thread
- GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [2639]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: candidate_resolutions, downsample_ratio, patch_size, ignore_id, mask_prompt, normalize, pad_token, add_special_token, image_std, image_mean, sft_format, image_token.
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 11:59:29 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 11:59:29 [config.py:721] This model supports multiple tasks: {'reward', 'classify', 'embed', 'score', 'generate'}. Defaulting to 'generate'.
INFO 02-04 11:59:33 [loader.py:460] Loading weights took 2.07 seconds
INFO 02-04 11:59:33 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.502876 seconds
Some kwargs in processor config are unused and will not have any effect: downsample_ratio, image_token, ignore_id, patch_size, candidate_resolutions, sft_format, pad_token, image_mean, normalize, add_special_token, mask_prompt, image_std.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 11:59:47 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 11:59:48 [worker.py:287] Memory profiling takes 14.62 seconds
INFO 02-04 11:59:48 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.75) = 47.99GiB
INFO 02-04 11:59:48 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.60GiB; the rest of the memory reserved for KV Cache is 38.60GiB.
INFO 02-04 11:59:48 [executor_base.py:112] # rocm blocks: 10541, # CPU blocks: 1092
INFO 02-04 11:59:48 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 82.35x
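
The "Maximum concurrency" figure in this run is consistent with kv_cache_blocks × block_size ÷ max_model_len. The block size is not printed in this log, so the value of 64 tokens per block below is an inferred assumption that happens to reproduce the reported numbers:

```python
def max_concurrency(num_gpu_blocks: int, max_model_len: int,
                    block_size: int = 64) -> float:
    # KV-cache token capacity divided by the per-request token budget.
    return num_gpu_blocks * block_size / max_model_len

# Values from this run's log: 10541 ROCm blocks, 8192 tokens per request.
print(round(max_concurrency(10541, 8192), 2))  # 82.35, matching the log
```

The same formula also matches the later runs (6214 blocks at 3281 tokens → 121.21x; 13203 blocks at 3281 tokens → 257.54x).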
INFO 02-04 11:59:50 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:02<00:00, 2.04it/s]
INFO 02-04 11:59:53 [model_runner.py:1752] Graph capturing finished in 3 secs, took 0.12 GiB
INFO 02-04 11:59:53 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 20.09 seconds
[SUCCESS] Model loading complete
[INFO] Thread pool configuration:
- CPU thread pool: 1 thread
- GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [3602]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: downsample_ratio, image_token, ignore_id, patch_size, candidate_resolutions, sft_format, pad_token, image_mean, normalize, add_special_token, mask_prompt, image_std.
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 13:15:25 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 13:15:25 [config.py:721] This model supports multiple tasks: {'reward', 'classify', 'generate', 'score', 'embed'}. Defaulting to 'generate'.
INFO 02-04 13:15:30 [loader.py:460] Loading weights took 2.13 seconds
INFO 02-04 13:15:30 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.562644 seconds
Some kwargs in processor config are unused and will not have any effect: ignore_id, pad_token, image_token, add_special_token, candidate_resolutions, mask_prompt, image_std, sft_format, patch_size, normalize, image_mean, downsample_ratio.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 13:15:44 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 13:15:45 [worker.py:287] Memory profiling takes 14.64 seconds
INFO 02-04 13:15:45 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.75) = 47.99GiB
INFO 02-04 13:15:45 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.60GiB; the rest of the memory reserved for KV Cache is 38.60GiB.
INFO 02-04 13:15:45 [executor_base.py:112] # rocm blocks: 10541, # CPU blocks: 1092
INFO 02-04 13:15:45 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 82.35x
INFO 02-04 13:15:47 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 19/19 [00:09<00:00, 2.08it/s]
INFO 02-04 13:15:56 [model_runner.py:1752] Graph capturing finished in 9 secs, took 0.16 GiB
INFO 02-04 13:15:56 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 26.33 seconds
[SUCCESS] Model loading complete
[INFO] Thread pool configuration:
- CPU thread pool: 2 threads
- GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [4593]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: ignore_id, pad_token, image_token, add_special_token, candidate_resolutions, mask_prompt, image_std, sft_format, patch_size, normalize, image_mean, downsample_ratio.
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 13:19:55 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 13:19:55 [config.py:721] This model supports multiple tasks: {'generate', 'embed', 'score', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 02-04 13:20:00 [loader.py:460] Loading weights took 2.10 seconds
INFO 02-04 13:20:00 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.520996 seconds
Some kwargs in processor config are unused and will not have any effect: add_special_token, mask_prompt, image_std, image_token, pad_token, image_mean, candidate_resolutions, sft_format, downsample_ratio, ignore_id, patch_size, normalize.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 13:20:14 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 13:20:15 [worker.py:287] Memory profiling takes 14.27 seconds
INFO 02-04 13:20:15 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.75) = 47.99GiB
INFO 02-04 13:20:15 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.60GiB; the rest of the memory reserved for KV Cache is 38.60GiB.
INFO 02-04 13:20:15 [executor_base.py:112] # rocm blocks: 10541, # CPU blocks: 1092
INFO 02-04 13:20:15 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 82.35x
INFO 02-04 13:20:17 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 19/19 [00:09<00:00, 2.09it/s]
INFO 02-04 13:20:26 [model_runner.py:1752] Graph capturing finished in 9 secs, took 0.16 GiB
INFO 02-04 13:20:26 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 25.91 seconds
[SUCCESS] Model loading complete
[INFO] Thread pool configuration:
- CPU thread pool: 1 thread
- GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [5553]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: add_special_token, mask_prompt, image_std, image_token, pad_token, image_mean, candidate_resolutions, sft_format, downsample_ratio, ignore_id, patch_size, normalize.
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 13:22:34 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 13:22:34 [config.py:721] This model supports multiple tasks: {'reward', 'classify', 'embed', 'generate', 'score'}. Defaulting to 'generate'.
INFO 02-04 13:22:39 [loader.py:460] Loading weights took 1.97 seconds
INFO 02-04 13:22:39 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.414745 seconds
Some kwargs in processor config are unused and will not have any effect: image_mean, sft_format, add_special_token, downsample_ratio, image_token, pad_token, ignore_id, patch_size, candidate_resolutions, mask_prompt, normalize, image_std.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 13:22:53 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 13:22:54 [worker.py:287] Memory profiling takes 14.41 seconds
INFO 02-04 13:22:54 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.75) = 47.99GiB
INFO 02-04 13:22:54 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.60GiB; the rest of the memory reserved for KV Cache is 38.60GiB.
INFO 02-04 13:22:54 [executor_base.py:112] # rocm blocks: 10541, # CPU blocks: 1092
INFO 02-04 13:22:54 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 82.35x
INFO 02-04 13:22:56 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 19/19 [00:09<00:00, 2.07it/s]
INFO 02-04 13:23:05 [model_runner.py:1752] Graph capturing finished in 9 secs, took 0.16 GiB
INFO 02-04 13:23:05 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 26.07 seconds
[SUCCESS] Model loading complete
[INFO] Thread pool configuration:
- CPU thread pool: 16 threads
- GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [6517]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: image_mean, sft_format, add_special_token, downsample_ratio, image_token, pad_token, ignore_id, patch_size, candidate_resolutions, mask_prompt, normalize, image_std.
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 13:27:19 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 13:27:19 [config.py:721] This model supports multiple tasks: {'classify', 'embed', 'score', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 02-04 13:27:23 [loader.py:460] Loading weights took 2.11 seconds
INFO 02-04 13:27:23 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.541007 seconds
Some kwargs in processor config are unused and will not have any effect: candidate_resolutions, mask_prompt, pad_token, sft_format, image_std, image_mean, add_special_token, normalize, downsample_ratio, patch_size, ignore_id, image_token.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 13:27:36 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 13:27:36 [worker.py:287] Memory profiling takes 13.04 seconds
INFO 02-04 13:27:36 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.50) = 31.99GiB
INFO 02-04 13:27:36 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.45GiB; the rest of the memory reserved for KV Cache is 22.76GiB.
INFO 02-04 13:27:37 [executor_base.py:112] # rocm blocks: 6214, # CPU blocks: 1092
INFO 02-04 13:27:37 [executor_base.py:117] Maximum concurrency for 3281 tokens per request: 121.21x
INFO 02-04 13:27:39 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:03<00:00, 1.62it/s]
INFO 02-04 13:27:42 [model_runner.py:1752] Graph capturing finished in 4 secs, took 0.12 GiB
INFO 02-04 13:27:42 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 19.26 seconds
[SUCCESS] Model loading complete
[INFO] Thread pool configuration:
- CPU thread pool: 16 threads
- GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [7962]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: candidate_resolutions, mask_prompt, pad_token, sft_format, image_std, image_mean, add_special_token, normalize, downsample_ratio, patch_size, ignore_id, image_token.
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 13:34:54 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 13:34:54 [config.py:721] This model supports multiple tasks: {'embed', 'score', 'classify', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 02-04 13:34:58 [loader.py:460] Loading weights took 1.98 seconds
INFO 02-04 13:34:59 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.419236 seconds
Some kwargs in processor config are unused and will not have any effect: add_special_token, downsample_ratio, candidate_resolutions, mask_prompt, normalize, pad_token, image_std, image_token, sft_format, image_mean, patch_size, ignore_id.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 13:35:11 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 13:35:12 [worker.py:287] Memory profiling takes 12.98 seconds
INFO 02-04 13:35:12 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.90) = 57.59GiB
INFO 02-04 13:35:12 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.45GiB; the rest of the memory reserved for KV Cache is 48.35GiB.
INFO 02-04 13:35:12 [executor_base.py:112] # rocm blocks: 13203, # CPU blocks: 1092
INFO 02-04 13:35:12 [executor_base.py:117] Maximum concurrency for 3281 tokens per request: 257.54x
INFO 02-04 13:35:14 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:03<00:00, 2.00it/s]
INFO 02-04 13:35:17 [model_runner.py:1752] Graph capturing finished in 3 secs, took 0.12 GiB
INFO 02-04 13:35:17 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 18.56 seconds
[SUCCESS] Model loading complete
[INFO] Thread pool configuration:
- CPU thread pool: 8 threads
- GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [9384]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: add_special_token, downsample_ratio, candidate_resolutions, mask_prompt, normalize, pad_token, image_std, image_token, sft_format, image_mean, patch_size, ignore_id.
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 13:38:10 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 13:38:10 [config.py:721] This model supports multiple tasks: {'reward', 'generate', 'classify', 'score', 'embed'}. Defaulting to 'generate'.
INFO 02-04 13:38:14 [loader.py:460] Loading weights took 1.97 seconds
INFO 02-04 13:38:14 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.406699 seconds
Some kwargs in processor config are unused and will not have any effect: downsample_ratio, mask_prompt, add_special_token, ignore_id, normalize, image_std, image_token, patch_size, image_mean, sft_format, pad_token, candidate_resolutions.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 13:38:27 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 13:38:28 [worker.py:287] Memory profiling takes 13.04 seconds
INFO 02-04 13:38:28 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.90) = 57.59GiB
INFO 02-04 13:38:28 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.45GiB; the rest of the memory reserved for KV Cache is 48.35GiB.
INFO 02-04 13:38:28 [executor_base.py:112] # rocm blocks: 13203, # CPU blocks: 1092
INFO 02-04 13:38:28 [executor_base.py:117] Maximum concurrency for 3281 tokens per request: 257.54x
INFO 02-04 13:38:30 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:03<00:00, 1.87it/s]
INFO 02-04 13:38:33 [model_runner.py:1752] Graph capturing finished in 3 secs, took 0.12 GiB
INFO 02-04 13:38:33 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 18.84 seconds
[SUCCESS] Model loading complete
[INFO] Thread pool configuration:
  - CPU thread pool: 2 threads
  - GPU thread pool: 1 thread
[INFO] Server started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [11311]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 15:40:57 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 15:40:57 [config.py:721] This model supports multiple tasks: {'generate', 'classify', 'embed', 'score', 'reward'}. Defaulting to 'generate'.
INFO 02-04 15:41:02 [loader.py:460] Loading weights took 2.10 seconds
INFO 02-04 15:41:02 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.532530 seconds
Some kwargs in processor config are unused and will not have any effect: mask_prompt, sft_format, normalize, add_special_token, image_mean, candidate_resolutions, image_token, ignore_id, pad_token, patch_size, image_std, downsample_ratio.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 15:41:16 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 15:41:17 [worker.py:287] Memory profiling takes 14.39 seconds
INFO 02-04 15:41:17 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.90) = 57.59GiB
INFO 02-04 15:41:17 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.60GiB; the rest of the memory reserved for KV Cache is 48.20GiB.
INFO 02-04 15:41:17 [executor_base.py:112] # rocm blocks: 13162, # CPU blocks: 1092
INFO 02-04 15:41:17 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 102.83x
INFO 02-04 15:41:19 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:02<00:00, 2.03it/s]
INFO 02-04 15:41:22 [model_runner.py:1752] Graph capturing finished in 3 secs, took 0.12 GiB
INFO 02-04 15:41:22 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 19.95 seconds
[SUCCESS] Model loading complete
[INFO] Thread pool configuration:
  - CPU thread pool: 2 threads
  - GPU thread pool: 1 thread
[INFO] Server started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [41027]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 17:02:48 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 17:02:48 [config.py:721] This model supports multiple tasks: {'embed', 'classify', 'score', 'generate', 'reward'}. Defaulting to 'generate'.
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/_device.py", line 79, in __torch_function__
[rank0]: return func(*args, **kwargs)
[rank0]: torch.OutOfMemoryError: HIP out of memory. Tried to allocate 280.00 MiB. GPU 0 has a total capacity of 63.98 GiB of which 0 bytes is free. Of the allocated memory 3.30 GiB is allocated by PyTorch, and 15.49 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_HIP_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
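The HIP OOM above names its own mitigation. A sketch of applying it before restarting; the environment variable comes from the error message itself, while the server flag name is an assumption about this deployment:

```shell
# Mitigation suggested by the OOM message: let the allocator grow segments
# instead of fragmenting fixed-size ones.
export PYTORCH_HIP_ALLOC_CONF=expandable_segments:True

# Optionally also lower the GPU memory fraction before restarting
# (flag support in deepseek_ocr_server.py is assumed, not confirmed by the log):
# python deepseek_ocr_server.py --gpu-memory-utilization 0.8
```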
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 17:19:43 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 17:19:43 [config.py:721] This model supports multiple tasks: {'generate', 'score', 'reward', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 02-04 17:19:47 [loader.py:460] Loading weights took 2.11 seconds
INFO 02-04 17:19:47 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.538627 seconds
Some kwargs in processor config are unused and will not have any effect: add_special_token, pad_token, downsample_ratio, ignore_id, mask_prompt, image_std, image_mean, normalize, patch_size, image_token, sft_format, candidate_resolutions.
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py", line 509, in <module>
[rank0]: main()
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py", line 500, in main
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 17:20:32 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 17:20:32 [config.py:721] This model supports multiple tasks: {'reward', 'embed', 'score', 'classify', 'generate'}. Defaulting to 'generate'.
INFO 02-04 17:20:36 [loader.py:460] Loading weights took 2.12 seconds
INFO 02-04 17:20:37 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.572350 seconds
Some kwargs in processor config are unused and will not have any effect: normalize, sft_format, image_token, mask_prompt, ignore_id, downsample_ratio, patch_size, candidate_resolutions, image_mean, pad_token, add_special_token, image_std.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 17:20:51 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 17:20:51 [worker.py:287] Memory profiling takes 14.59 seconds
INFO 02-04 17:20:51 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.90) = 57.59GiB
INFO 02-04 17:20:51 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.60GiB; the rest of the memory reserved for KV Cache is 48.20GiB.
INFO 02-04 17:20:52 [executor_base.py:112] # rocm blocks: 13162, # CPU blocks: 1092
INFO 02-04 17:20:52 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 102.83x
INFO 02-04 17:20:54 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:03<00:00, 1.70it/s]
INFO 02-04 17:20:57 [model_runner.py:1752] Graph capturing finished in 4 secs, took 0.12 GiB
INFO 02-04 17:20:57 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 20.50 seconds
[SUCCESS] Model loading complete
[INFO] Thread pool configuration:
  - CPU thread pool: 2 threads
  - GPU thread pool: 1 thread
[INFO] Server started: http://0.0.0.0:8708
[INFO] API docs: http://0.0.0.0:8708/docs
INFO: Started server process [45841]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8708 (Press CTRL+C to quit)
assert len(tokenized_str) == len(images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"