Commit 6ad287f7 authored by liuxu3

Added DeepSeek OCR API by liushengtong

parent 80c11a03
INFO 02-04 11:46:35 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py:472: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
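The deprecated @app.on_event("shutdown") hook flagged above maps directly onto FastAPI's lifespan handler; a minimal migration sketch follows (what deepseek_ocr_server.py actually cleans up on shutdown is not visible in this log, so the handler body is an assumption):

    # Minimal lifespan sketch for the deprecation warning above; the actual
    # shutdown work done by the server is an assumption, not shown here.
    from contextlib import asynccontextmanager
    from fastapi import FastAPI

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        yield  # startup work, if any, runs before the yield
        # shutdown work replaces @app.on_event("shutdown"), e.g. draining
        # the CPU/GPU thread pools reported later in this log

    app = FastAPI(lifespan=lifespan)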
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 11:46:40 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 11:46:40 [config.py:721] This model supports multiple tasks: {'generate', 'embed', 'score', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 02-04 11:46:40 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr', speculative_config=None, tokenizer='/home/lst/deepseek_ocr', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=3281, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[24,16,8,4,2,1],"max_capture_size":24}, use_cached_outputs=False,
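That config dump corresponds, knob for knob, to an offline vLLM engine; a hedged sketch of how it could have been constructed (the real call in deepseek_ocr_server.py is not shown in this log, but every keyword below is a public vllm.LLM argument):

    from vllm import LLM

    # Sketch only: reproduces the logged config, not the server's actual code.
    llm = LLM(
        model="/home/lst/deepseek_ocr",
        trust_remote_code=True,      # required for DeepseekOCRForCausalLM
        dtype="bfloat16",
        max_model_len=3281,          # max_seq_len in the dump above
        gpu_memory_utilization=0.5,  # see the memory-profiling lines below
        hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
        enforce_eager=False,         # keeps CUDA graph capture enabled
    )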
INFO 02-04 11:46:41 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 11:46:41 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 11:46:41 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 11:46:41 [worker_base.py:653] ########## 2639 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 11:46:41 [worker_base.py:654] ########## 2639 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 11:46:41.823388 2639 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 11:46:41.823468 2639 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 11:46:41.824054 2639 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x56499617e120, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 11:46:41.824072 2639 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 11:46:41.843951 2639 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x56499617e120, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 11:46:41.843997 2639 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 11:46:41.845223 2639 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x56499617e120, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 11:46:41.845247 2639 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 11:46:41.846169 2639 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x56499617e120, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 11:46:41.846191 2639 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 11:46:41 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 11:46:41 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr...
INFO 02-04 11:46:42 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24] is overridden by config [1, 2, 4, 8, 16, 24]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 7.00it/s]
INFO 02-04 11:46:45 [loader.py:460] Loading weights took 2.10 seconds
INFO 02-04 11:46:45 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.525227 seconds
Some kwargs in processor config are unused and will not have any effect: candidate_resolutions, downsample_ratio, patch_size, ignore_id, mask_prompt, normalize, pad_token, add_special_token, image_std, image_mean, sft_format, image_token.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 11:46:58 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
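vLLM resolves tuned fused-MoE kernel parameters from a JSON file whose name encodes the expert count (E=64), the shard size (N=896), and the device name (K100_AI), exactly as printed in the warning; a small sketch of where such a file would live (producing a tuned config is left to vLLM's MoE benchmarking scripts and is not shown here):

    import os
    import vllm

    # Path at which a tuned config would silence the warning above.
    cfg_dir = os.path.join(os.path.dirname(vllm.__file__),
                           "model_executor/layers/fused_moe/configs")
    print(os.path.join(cfg_dir, "E=64,N=896,device_name=K100_AI.json"))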
INFO 02-04 11:46:58 [worker.py:287] Memory profiling takes 12.97 seconds
INFO 02-04 11:46:58 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.50) = 31.99GiB
INFO 02-04 11:46:58 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.45GiB; the rest of the memory reserved for KV Cache is 22.76GiB.
INFO 02-04 11:46:58 [executor_base.py:112] # rocm blocks: 1553, # CPU blocks: 273
INFO 02-04 11:46:58 [executor_base.py:117] Maximum concurrency for 3281 tokens per request: 121.17x
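The profiling and concurrency lines fit together arithmetically; a quick consistency check (the 256-token KV block size is inferred from the numbers, it is not printed in the log):

    # KV budget: 63.98 GiB x 0.50 - 6.23 - 1.55 - 1.45 ≈ 22.76 GiB (as logged)
    assert abs(63.98 * 0.50 - 6.23 - 1.55 - 1.45 - 22.76) < 0.01
    # Concurrency: a 256-token KV block reproduces the logged 121.17x
    # (the block size is inferred, not printed):
    assert abs(1553 * 256 / 3281 - 121.17) < 0.01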
INFO 02-04 11:47:01 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:03<00:00, 1.53it/s]
INFO 02-04 11:47:04 [model_runner.py:1752] Graph capturing finished in 4 secs, took 0.12 GiB
INFO 02-04 11:47:04 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 19.41 seconds
[SUCCESS] Model loaded
[INFO] Thread pool configuration:
  - CPU thread pool: 1 thread
  - GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [2639]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: candidate_resolutions, downsample_ratio, patch_size, ignore_id, mask_prompt, normalize, pad_token, add_special_token, image_std, image_mean, sft_format, image_token.
[1/3] Tokenizing 19 pages...
[1/3] Tokenization complete
[2/3] GPU batch inference on 19 pages...
Processed prompts: 100%|██████████| 19/19 [00:22<00:00, 1.20s/it, est. speed input: 762.83 toks/s, output: 220.97 toks/s]
[2/3] GPU inference complete
OCR time: 29.08s
[3/3] Post-processing...
[3/3] Post-processing complete (0.00s)
============================================================
[SUCCESS] All done
Total time: 31.52s
Average: 1.66s/page
============================================================
INFO: 127.0.0.1:35920 - "POST /ocr HTTP/1.1" 200 OK
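The 200 above is the first /ocr request completing end to end. The request schema is not visible in this log, so the client sketch below is hypothetical throughout: only the URL, method, and port are confirmed by the access log; the multipart field name "file" and the JSON response are assumptions:

    import requests

    # Hypothetical client for the /ocr endpoint; field name and response
    # shape are assumptions, only URL and method appear in the access log.
    with open("document.pdf", "rb") as f:
        resp = requests.post("http://127.0.0.1:8707/ocr",
                             files={"file": f}, timeout=300)
    resp.raise_for_status()
    print(resp.json())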
INFO 02-04 11:59:24 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py:472: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 11:59:29 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 11:59:29 [config.py:721] This model supports multiple tasks: {'reward', 'classify', 'embed', 'score', 'generate'}. Defaulting to 'generate'.
INFO 02-04 11:59:29 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr', speculative_config=None, tokenizer='/home/lst/deepseek_ocr', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[24,16,8,4,2,1],"max_capture_size":24}, use_cached_outputs=False,
INFO 02-04 11:59:29 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 11:59:29 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 11:59:29 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 11:59:29 [worker_base.py:653] ########## 3602 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 11:59:29 [worker_base.py:654] ########## 3602 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 11:59:29.986018 3602 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 11:59:29.986095 3602 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 11:59:29.986546 3602 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55947c9e72b0, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 11:59:29.986562 3602 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 11:59:30.005985 3602 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55947c9e72b0, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 11:59:30.006026 3602 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 11:59:30.007256 3602 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55947c9e72b0, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 11:59:30.007274 3602 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 11:59:30.008291 3602 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55947c9e72b0, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 11:59:30.008311 3602 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 11:59:30 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 11:59:30 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr...
INFO 02-04 11:59:30 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24] is overridden by config [1, 2, 4, 8, 16, 24]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 7.05it/s]
INFO 02-04 11:59:33 [loader.py:460] Loading weights took 2.07 seconds
INFO 02-04 11:59:33 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.502876 seconds
Some kwargs in processor config are unused and will not have any effect: downsample_ratio, image_token, ignore_id, patch_size, candidate_resolutions, sft_format, pad_token, image_mean, normalize, add_special_token, mask_prompt, image_std.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 11:59:47 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 11:59:48 [worker.py:287] Memory profiling takes 14.62 seconds
INFO 02-04 11:59:48 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.75) = 47.99GiB
INFO 02-04 11:59:48 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.60GiB; the rest of the memory reserved for KV Cache is 38.60GiB.
INFO 02-04 11:59:48 [executor_base.py:112] # rocm blocks: 10541, # CPU blocks: 1092
INFO 02-04 11:59:48 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 82.35x
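The same consistency check as in the first run, on this run's numbers; note it only balances with a 64-token KV block rather than 256, so the block size apparently changed alongside gpu_memory_utilization (both block sizes are inferred, neither is printed in the log):

    # KV budget: 63.98 GiB x 0.75 - 6.23 - 1.55 - 1.60 ≈ 38.60 GiB (as logged)
    assert abs(63.98 * 0.75 - 6.23 - 1.55 - 1.60 - 38.60) < 0.01
    # 82.35x only balances with a 64-token block (inferred):
    assert abs(10541 * 64 / 8192 - 82.35) < 0.01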
INFO 02-04 11:59:50 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:02<00:00, 2.04it/s]
INFO 02-04 11:59:53 [model_runner.py:1752] Graph capturing finished in 3 secs, took 0.12 GiB
INFO 02-04 11:59:53 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 20.09 seconds
[SUCCESS] Model loaded
[INFO] Thread pool configuration:
  - CPU thread pool: 1 thread
  - GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [3602]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: downsample_ratio, image_token, ignore_id, patch_size, candidate_resolutions, sft_format, pad_token, image_mean, normalize, add_special_token, mask_prompt, image_std.
[1/3] Tokenizing 19 pages...
[1/3] Tokenization complete
[2/3] GPU batch inference on 19 pages...
Processed prompts: 100%|██████████| 19/19 [00:33<00:00, 1.78s/it, est. speed input: 513.98 toks/s, output: 222.40 toks/s]
[2/3] GPU inference complete
OCR time: 39.72s
[3/3] Post-processing...
[3/3] Post-processing complete (0.00s)
============================================================
[SUCCESS] All done
Total time: 42.18s
Average: 2.22s/page
============================================================
INFO: 127.0.0.1:48266 - "POST /ocr HTTP/1.1" 200 OK
INFO 02-04 13:15:20 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py:472: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 13:15:25 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 13:15:25 [config.py:721] This model supports multiple tasks: {'reward', 'classify', 'generate', 'score', 'embed'}. Defaulting to 'generate'.
INFO 02-04 13:15:25 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr', speculative_config=None, tokenizer='/home/lst/deepseek_ocr', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":128}, use_cached_outputs=False,
INFO 02-04 13:15:26 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 13:15:26 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 13:15:26 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 13:15:26 [worker_base.py:653] ########## 4593 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 13:15:26 [worker_base.py:654] ########## 4593 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 13:15:26.794960 4593 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 13:15:26.795035 4593 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:15:26.795487 4593 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55747cc42d80, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 13:15:26.795502 4593 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:15:26.814953 4593 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55747cc42d80, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 13:15:26.814989 4593 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:15:26.816212 4593 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55747cc42d80, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 13:15:26.816231 4593 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:15:26.817245 4593 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55747cc42d80, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 13:15:26.817265 4593 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 13:15:26 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 13:15:26 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr...
INFO 02-04 13:15:27 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128] is overridden by config [128, 1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 6.83it/s]
INFO 02-04 13:15:30 [loader.py:460] Loading weights took 2.13 seconds
INFO 02-04 13:15:30 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.562644 seconds
Some kwargs in processor config are unused and will not have any effect: ignore_id, pad_token, image_token, add_special_token, candidate_resolutions, mask_prompt, image_std, sft_format, patch_size, normalize, image_mean, downsample_ratio.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 13:15:44 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 13:15:45 [worker.py:287] Memory profiling takes 14.64 seconds
INFO 02-04 13:15:45 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.75) = 47.99GiB
INFO 02-04 13:15:45 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.60GiB; the rest of the memory reserved for KV Cache is 38.60GiB.
INFO 02-04 13:15:45 [executor_base.py:112] # rocm blocks: 10541, # CPU blocks: 1092
INFO 02-04 13:15:45 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 82.35x
INFO 02-04 13:15:47 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 19/19 [00:09<00:00, 2.08it/s]
INFO 02-04 13:15:56 [model_runner.py:1752] Graph capturing finished in 9 secs, took 0.16 GiB
INFO 02-04 13:15:56 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 26.33 seconds
[SUCCESS] Model loaded
[INFO] Thread pool configuration:
  - CPU thread pool: 2 threads
  - GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [4593]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: ignore_id, pad_token, image_token, add_special_token, candidate_resolutions, mask_prompt, image_std, sft_format, patch_size, normalize, image_mean, downsample_ratio.
[1/3] Tokenizing 19 pages...
[1/3] Tokenization complete
[2/3] GPU batch inference on 19 pages...
Processed prompts: 100%|██████████| 19/19 [00:33<00:00, 1.77s/it, est. speed input: 514.78 toks/s, output: 222.74 toks/s]
[2/3] GPU inference complete
OCR time: 38.37s
[3/3] Post-processing...
[3/3] Post-processing complete (0.00s)
============================================================
[SUCCESS] All done
Total time: 40.82s
Average: 2.15s/page
============================================================
INFO: 127.0.0.1:55486 - "POST /ocr HTTP/1.1" 200 OK
[1/3] Tokenizing 19 pages...
[1/3] Tokenization complete
[2/3] GPU batch inference on 19 pages...
Processed prompts: 100%|██████████| 19/19 [00:31<00:00, 1.64s/it, est. speed input: 557.98 toks/s, output: 241.44 toks/s]
[2/3] GPU inference complete
OCR time: 33.96s
[3/3] Post-processing...
[3/3] Post-processing complete (0.00s)
============================================================
[SUCCESS] All done
Total time: 36.41s
Average: 1.92s/page
============================================================
INFO: 127.0.0.1:35584 - "POST /ocr HTTP/1.1" 200 OK
INFO 02-04 13:19:50 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py:472: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 13:19:55 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 13:19:55 [config.py:721] This model supports multiple tasks: {'generate', 'embed', 'score', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 02-04 13:19:55 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr', speculative_config=None, tokenizer='/home/lst/deepseek_ocr', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":128}, use_cached_outputs=False,
INFO 02-04 13:19:56 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 13:19:56 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 13:19:56 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 13:19:56 [worker_base.py:653] ########## 5553 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 13:19:56 [worker_base.py:654] ########## 5553 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 13:19:56.829571 5553 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 13:19:56.829648 5553 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:19:56.830089 5553 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x5569d16c6270, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 13:19:56.830101 5553 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:19:56.849946 5553 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x5569d16c6270, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 13:19:56.849987 5553 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:19:56.851194 5553 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x5569d16c6270, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 13:19:56.851212 5553 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:19:56.852396 5553 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x5569d16c6270, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 13:19:56.852418 5553 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 13:19:56 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 13:19:56 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr...
INFO 02-04 13:19:57 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128] is overridden by config [128, 1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 7.07it/s]
INFO 02-04 13:20:00 [loader.py:460] Loading weights took 2.10 seconds
INFO 02-04 13:20:00 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.520996 seconds
Some kwargs in processor config are unused and will not have any effect: add_special_token, mask_prompt, image_std, image_token, pad_token, image_mean, candidate_resolutions, sft_format, downsample_ratio, ignore_id, patch_size, normalize.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 13:20:14 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 13:20:15 [worker.py:287] Memory profiling takes 14.27 seconds
INFO 02-04 13:20:15 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.75) = 47.99GiB
INFO 02-04 13:20:15 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.60GiB; the rest of the memory reserved for KV Cache is 38.60GiB.
INFO 02-04 13:20:15 [executor_base.py:112] # rocm blocks: 10541, # CPU blocks: 1092
INFO 02-04 13:20:15 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 82.35x
INFO 02-04 13:20:17 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 19/19 [00:09<00:00, 2.09it/s]
INFO 02-04 13:20:26 [model_runner.py:1752] Graph capturing finished in 9 secs, took 0.16 GiB
INFO 02-04 13:20:26 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 25.91 seconds
[SUCCESS] Model loaded
[INFO] Thread pool configuration:
  - CPU thread pool: 1 thread
  - GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [5553]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: add_special_token, mask_prompt, image_std, image_token, pad_token, image_mean, candidate_resolutions, sft_format, downsample_ratio, ignore_id, patch_size, normalize.
[1/3] Tokenizing 19 pages...
[1/3] Tokenization complete
[2/3] GPU batch inference on 19 pages...
Processed prompts: 100%|██████████| 19/19 [00:33<00:00, 1.78s/it, est. speed input: 513.20 toks/s, output: 222.06 toks/s]
[2/3] GPU inference complete
OCR time: 40.02s
[3/3] Post-processing...
[3/3] Post-processing complete (0.00s)
============================================================
[SUCCESS] All done
Total time: 42.47s
Average: 2.24s/page
============================================================
INFO: 127.0.0.1:54172 - "POST /ocr HTTP/1.1" 200 OK
INFO 02-04 13:22:29 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py:472: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 13:22:34 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 13:22:34 [config.py:721] This model supports multiple tasks: {'reward', 'classify', 'embed', 'generate', 'score'}. Defaulting to 'generate'.
INFO 02-04 13:22:34 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr', speculative_config=None, tokenizer='/home/lst/deepseek_ocr', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":128}, use_cached_outputs=False,
INFO 02-04 13:22:35 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 13:22:35 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 13:22:35 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 13:22:35 [worker_base.py:653] ########## 6517 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 13:22:35 [worker_base.py:654] ########## 6517 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 13:22:35.820861 6517 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 13:22:35.820928 6517 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:22:35.821431 6517 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55ac01ed57e0, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 13:22:35.821447 6517 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:22:35.840910 6517 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55ac01ed57e0, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 13:22:35.840948 6517 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:22:35.842164 6517 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55ac01ed57e0, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 13:22:35.842182 6517 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:22:35.843191 6517 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55ac01ed57e0, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 13:22:35.843209 6517 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 13:22:35 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 13:22:35 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr...
INFO 02-04 13:22:36 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128] is overridden by config [128, 1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 6.99it/s]
INFO 02-04 13:22:39 [loader.py:460] Loading weights took 1.97 seconds
INFO 02-04 13:22:39 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.414745 seconds
Some kwargs in processor config are unused and will not have any effect: image_mean, sft_format, add_special_token, downsample_ratio, image_token, pad_token, ignore_id, patch_size, candidate_resolutions, mask_prompt, normalize, image_std.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 13:22:53 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
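The fused-MoE warning means vLLM found no tuned Triton kernel configuration for this expert shape (E=64, N=896) on the K100_AI device, so it falls back to generic tile sizes. A trivial presence check for the file the warning names (path copied verbatim from the line above); vLLM's repository includes a MoE tuning benchmark that can generate such a file for a new device:

    import os

    # If this file existed, vLLM would load device-tuned MoE kernel
    # parameters instead of its defaults, silencing the warning above.
    cfg = ("/usr/local/lib/python3.10/dist-packages/vllm/model_executor/"
           "layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json")
    print("tuned MoE config present:", os.path.exists(cfg))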
INFO 02-04 13:22:54 [worker.py:287] Memory profiling takes 14.41 seconds
INFO 02-04 13:22:54 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.75) = 47.99GiB
INFO 02-04 13:22:54 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.60GiB; the rest of the memory reserved for KV Cache is 38.60GiB.
INFO 02-04 13:22:54 [executor_base.py:112] # rocm blocks: 10541, # CPU blocks: 1092
INFO 02-04 13:22:54 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 82.35x
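The profile numbers above are internally consistent, and the concurrency figure follows from them. A worked check; the 64-token KV-cache block size is inferred from the reported block count, not stated in the log:

    # KV-cache budget = total * utilization - weights - non-torch - activations
    kv_gib = 63.98 * 0.75 - 6.23 - 1.55 - 1.60   # = 38.61, matching ~38.60 GiB
    # Max concurrency = cacheable tokens / max_seq_len, assuming 64-token blocks
    concurrency = 10541 * 64 / 8192              # = 82.35, matching the log
    print(kv_gib, concurrency)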
INFO 02-04 13:22:56 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
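Nothing in this run actually hits the out-of-memory path, but the message above spells out the fallback. A minimal sketch of starting the same model eagerly; the model path, trust_remote_code, and gpu_memory_utilization are taken from this log's engine config, everything else is left at defaults:

    from vllm import LLM

    # Eager-mode fallback suggested by the message above: skips CUDA-graph
    # capture entirely at the cost of some decode throughput.
    llm = LLM(model="/home/lst/deepseek_ocr",
              trust_remote_code=True,
              gpu_memory_utilization=0.75,
              enforce_eager=True)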
Capturing CUDA graph shapes: 100%|██████████| 19/19 [00:09<00:00, 2.07it/s]
INFO 02-04 13:23:05 [model_runner.py:1752] Graph capturing finished in 9 secs, took 0.16 GiB
INFO 02-04 13:23:05 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 26.07 seconds
[SUCCESS] Model loaded
[INFO] Thread pool configuration:
- CPU thread pool: 16 threads
- GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [6517]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: image_mean, sft_format, add_special_token, downsample_ratio, image_token, pad_token, ignore_id, patch_size, candidate_resolutions, mask_prompt, normalize, image_std.
[1/3] Tokenizing 19 pages...
[1/3] Tokenization complete
[2/3] GPU batch inference on 19 pages...
Processed prompts: 100%|██████████| 19/19 [00:33<00:00, 1.77s/it, est. speed input: 517.07 toks/s, output: 223.73 toks/s]
[2/3] GPU inference complete
OCR time: 37.41s
[3/3] Post-processing...
[3/3] Post-processing complete (0.00s)
============================================================
[SUCCESS] All done
Total time: 39.84s
Average: 2.10s/page
============================================================
INFO: 127.0.0.1:46140 - "POST /ocr HTTP/1.1" 200 OK
INFO 02-04 13:27:14 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py:472: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 13:27:19 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 13:27:19 [config.py:721] This model supports multiple tasks: {'classify', 'embed', 'score', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 02-04 13:27:19 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr', speculative_config=None, tokenizer='/home/lst/deepseek_ocr', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=3281, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[24,16,8,4,2,1],"max_capture_size":24}, use_cached_outputs=False,
INFO 02-04 13:27:19 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 13:27:19 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 13:27:19 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 13:27:19 [worker_base.py:653] ########## 7962 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 13:27:19 [worker_base.py:654] ########## 7962 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 13:27:19.934458 7962 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 13:27:19.934533 7962 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:27:19.934978 7962 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55c7b0a08000, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 13:27:19.934991 7962 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:27:19.954949 7962 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55c7b0a08000, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 13:27:19.954993 7962 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:27:19.956467 7962 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55c7b0a08000, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 13:27:19.956496 7962 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:27:19.957623 7962 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55c7b0a08000, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 13:27:19.957641 7962 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 13:27:19 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 13:27:19 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr...
INFO 02-04 13:27:20 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24] is overridden by config [1, 2, 4, 8, 16, 24]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 6.99it/s]
INFO 02-04 13:27:23 [loader.py:460] Loading weights took 2.11 seconds
INFO 02-04 13:27:23 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.541007 seconds
Some kwargs in processor config are unused and will not have any effect: candidate_resolutions, mask_prompt, pad_token, sft_format, image_std, image_mean, add_special_token, normalize, downsample_ratio, patch_size, ignore_id, image_token.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 13:27:36 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 13:27:36 [worker.py:287] Memory profiling takes 13.04 seconds
INFO 02-04 13:27:36 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.50) = 31.99GiB
INFO 02-04 13:27:36 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.45GiB; the rest of the memory reserved for KV Cache is 22.76GiB.
INFO 02-04 13:27:37 [executor_base.py:112] # rocm blocks: 6214, # CPU blocks: 1092
INFO 02-04 13:27:37 [executor_base.py:117] Maximum concurrency for 3281 tokens per request: 121.21x
INFO 02-04 13:27:39 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:03<00:00, 1.62it/s]
INFO 02-04 13:27:42 [model_runner.py:1752] Graph capturing finished in 4 secs, took 0.12 GiB
INFO 02-04 13:27:42 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 19.26 seconds
[SUCCESS] Model loaded
[INFO] Thread pool configuration:
- CPU thread pool: 16 threads
- GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [7962]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: candidate_resolutions, mask_prompt, pad_token, sft_format, image_std, image_mean, add_special_token, normalize, downsample_ratio, patch_size, ignore_id, image_token.
[1/3] Tokenizing 19 pages...
[1/3] Tokenization complete
[2/3] GPU batch inference on 19 pages...
Processed prompts: 100%|██████████| 19/19 [00:33<00:00, 1.78s/it, est. speed input: 513.33 toks/s, output: 220.58 toks/s]
[2/3] GPU inference complete
OCR time: 37.73s
[3/3] Post-processing...
[3/3] Post-processing complete (0.00s)
============================================================
[SUCCESS] All done
Total time: 40.18s
Average: 2.11s/page
============================================================
INFO: 127.0.0.1:36984 - "POST /ocr HTTP/1.1" 200 OK
INFO 02-04 13:34:49 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py:472: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 13:34:54 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 13:34:54 [config.py:721] This model supports multiple tasks: {'embed', 'score', 'classify', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 02-04 13:34:54 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr', speculative_config=None, tokenizer='/home/lst/deepseek_ocr', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=3281, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[24,16,8,4,2,1],"max_capture_size":24}, use_cached_outputs=False,
INFO 02-04 13:34:55 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 13:34:55 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 13:34:55 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 13:34:55 [worker_base.py:653] ########## 9384 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 13:34:55 [worker_base.py:654] ########## 9384 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 13:34:55.404677 9384 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 13:34:55.404740 9384 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:34:55.405254 9384 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x564898a311e0, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 13:34:55.405270 9384 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:34:55.424903 9384 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x564898a311e0, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 13:34:55.424939 9384 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:34:55.426165 9384 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x564898a311e0, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 13:34:55.426184 9384 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:34:55.427194 9384 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x564898a311e0, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 13:34:55.427210 9384 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 13:34:55 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 13:34:55 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr...
INFO 02-04 13:34:56 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24] is overridden by config [1, 2, 4, 8, 16, 24]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 7.00it/s]
INFO 02-04 13:34:58 [loader.py:460] Loading weights took 1.98 seconds
INFO 02-04 13:34:59 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.419236 seconds
Some kwargs in processor config are unused and will not have any effect: add_special_token, downsample_ratio, candidate_resolutions, mask_prompt, normalize, pad_token, image_std, image_token, sft_format, image_mean, patch_size, ignore_id.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 13:35:11 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 13:35:12 [worker.py:287] Memory profiling takes 12.98 seconds
INFO 02-04 13:35:12 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.90) = 57.59GiB
INFO 02-04 13:35:12 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.45GiB; the rest of the memory reserved for KV Cache is 48.35GiB.
INFO 02-04 13:35:12 [executor_base.py:112] # rocm blocks: 13203, # CPU blocks: 1092
INFO 02-04 13:35:12 [executor_base.py:117] Maximum concurrency for 3281 tokens per request: 257.54x
INFO 02-04 13:35:14 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:03<00:00, 2.00it/s]
INFO 02-04 13:35:17 [model_runner.py:1752] Graph capturing finished in 3 secs, took 0.12 GiB
INFO 02-04 13:35:17 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 18.56 seconds
[SUCCESS] Model loaded
[INFO] Thread pool configuration:
- CPU thread pool: 8 threads
- GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [9384]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: add_special_token, downsample_ratio, candidate_resolutions, mask_prompt, normalize, pad_token, image_std, image_token, sft_format, image_mean, patch_size, ignore_id.
[1/3] Tokenizing 19 pages...
[1/3] Tokenization complete
[2/3] GPU batch inference on 19 pages...
Processed prompts: 100%|██████████| 19/19 [00:33<00:00, 1.78s/it, est. speed input: 512.22 toks/s, output: 220.10 toks/s]
[2/3] GPU inference complete
OCR time: 37.72s
[3/3] Post-processing...
[3/3] Post-processing complete (0.00s)
============================================================
[SUCCESS] All done
Total time: 40.16s
Average: 2.11s/page
============================================================
INFO: 127.0.0.1:33914 - "POST /ocr HTTP/1.1" 200 OK
INFO 02-04 13:38:05 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py:472: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 13:38:10 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 13:38:10 [config.py:721] This model supports multiple tasks: {'reward', 'generate', 'classify', 'score', 'embed'}. Defaulting to 'generate'.
INFO 02-04 13:38:10 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr', speculative_config=None, tokenizer='/home/lst/deepseek_ocr', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=3281, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[24,16,8,4,2,1],"max_capture_size":24}, use_cached_outputs=False,
INFO 02-04 13:38:10 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 13:38:10 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 13:38:10 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 13:38:10 [worker_base.py:653] ########## 11311 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 13:38:10 [worker_base.py:654] ########## 11311 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 13:38:11.160339 11311 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 13:38:11.160403 11311 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:38:11.160856 11311 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55af37e0dec0, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 13:38:11.160869 11311 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:38:11.180904 11311 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55af37e0dec0, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 13:38:11.180943 11311 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:38:11.182142 11311 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55af37e0dec0, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 13:38:11.182159 11311 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 13:38:11.183102 11311 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55af37e0dec0, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 13:38:11.183123 11311 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 13:38:11 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 13:38:11 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr...
INFO 02-04 13:38:11 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24] is overridden by config [1, 2, 4, 8, 16, 24]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 7.12it/s]
INFO 02-04 13:38:14 [loader.py:460] Loading weights took 1.97 seconds
INFO 02-04 13:38:14 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.406699 seconds
Some kwargs in processor config are unused and will not have any effect: downsample_ratio, mask_prompt, add_special_token, ignore_id, normalize, image_std, image_token, patch_size, image_mean, sft_format, pad_token, candidate_resolutions.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 13:38:27 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 13:38:28 [worker.py:287] Memory profiling takes 13.04 seconds
INFO 02-04 13:38:28 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.90) = 57.59GiB
INFO 02-04 13:38:28 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.45GiB; the rest of the memory reserved for KV Cache is 48.35GiB.
INFO 02-04 13:38:28 [executor_base.py:112] # rocm blocks: 13203, # CPU blocks: 1092
INFO 02-04 13:38:28 [executor_base.py:117] Maximum concurrency for 3281 tokens per request: 257.54x
INFO 02-04 13:38:30 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:03<00:00, 1.87it/s]
INFO 02-04 13:38:33 [model_runner.py:1752] Graph capturing finished in 3 secs, took 0.12 GiB
INFO 02-04 13:38:33 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 18.84 seconds
[SUCCESS] Model loaded
[INFO] Thread pool configuration:
- CPU thread pool: 2 threads
- GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [11311]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
Some kwargs in processor config are unused and will not have any effect: downsample_ratio, mask_prompt, add_special_token, ignore_id, normalize, image_std, image_token, patch_size, image_mean, sft_format, pad_token, candidate_resolutions.
[1/3] Tokenizing 19 pages...
[1/3] Tokenization complete
[2/3] GPU batch inference on 19 pages...
Processed prompts: 100%|██████████| 19/19 [00:33<00:00, 1.78s/it, est. speed input: 513.48 toks/s, output: 220.64 toks/s]
[2/3] GPU inference complete
OCR time: 38.49s
[3/3] Post-processing...
[3/3] Post-processing complete (0.00s)
============================================================
[SUCCESS] All done
Total time: 40.93s
Average: 2.15s/page
============================================================
INFO: 127.0.0.1:56690 - "POST /ocr HTTP/1.1" 200 OK
[1/3] Tokenizing 19 pages...
[1/3] Tokenization complete
[2/3] GPU batch inference on 19 pages...
Processed prompts: 100%|██████████| 19/19 [00:31<00:00, 1.64s/it, est. speed input: 556.94 toks/s, output: 239.32 toks/s]
[2/3] GPU inference complete
OCR time: 34.03s
[3/3] Post-processing...
[3/3] Post-processing complete (0.00s)
============================================================
[SUCCESS] All done
Total time: 36.43s
Average: 1.92s/page
============================================================
INFO: 127.0.0.1:56556 - "POST /ocr HTTP/1.1" 200 OK
INFO 02-04 15:40:52 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py:472: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 15:40:57 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 15:40:57 [config.py:721] This model supports multiple tasks: {'generate', 'classify', 'embed', 'score', 'reward'}. Defaulting to 'generate'.
INFO 02-04 15:40:57 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr', speculative_config=None, tokenizer='/home/lst/deepseek_ocr', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[24,16,8,4,2,1],"max_capture_size":24}, use_cached_outputs=False,
INFO 02-04 15:40:58 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 15:40:58 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 15:40:58 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 15:40:58 [worker_base.py:653] ########## 41027 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 15:40:58 [worker_base.py:654] ########## 41027 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 15:40:58.675812 41027 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 15:40:58.675896 41027 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 15:40:58.676411 41027 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55a4f8eebfa0, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 15:40:58.676433 41027 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 15:40:58.695976 41027 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55a4f8eebfa0, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 15:40:58.696019 41027 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 15:40:58.697266 41027 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55a4f8eebfa0, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 15:40:58.697289 41027 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 15:40:58.698205 41027 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55a4f8eebfa0, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 15:40:58.698228 41027 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 15:40:58 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 15:40:58 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr...
INFO 02-04 15:40:59 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24] is overridden by config [1, 2, 4, 8, 16, 24]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 6.93it/s]
INFO 02-04 15:41:02 [loader.py:460] Loading weights took 2.10 seconds
INFO 02-04 15:41:02 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.532530 seconds
Some kwargs in processor config are unused and will not have any effect: mask_prompt, sft_format, normalize, add_special_token, image_mean, candidate_resolutions, image_token, ignore_id, pad_token, patch_size, image_std, downsample_ratio.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 15:41:16 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 15:41:17 [worker.py:287] Memory profiling takes 14.39 seconds
INFO 02-04 15:41:17 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.90) = 57.59GiB
INFO 02-04 15:41:17 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.60GiB; the rest of the memory reserved for KV Cache is 48.20GiB.
INFO 02-04 15:41:17 [executor_base.py:112] # rocm blocks: 13162, # CPU blocks: 1092
INFO 02-04 15:41:17 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 102.83x
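The executor figures above are internally consistent: 57.59 GiB is 63.98 × 0.90 (up to rounding), and the 48.20 GiB KV-cache budget is what remains after subtracting the weights (6.23), non-torch memory (1.55), and activation peak (1.60). The 102.83x concurrency figure can be reproduced the same way; a small sketch, where the 64-token KV block size is an inference on my part (it is not logged, but it is the only value that yields 102.83):

gpu_blocks = 13162     # "# rocm blocks" from executor_base.py:112 above
block_size = 64        # tokens per KV-cache block (inferred, not logged)
max_seq_len = 8192     # max tokens per request
print(f"{gpu_blocks * block_size / max_seq_len:.2f}x")  # -> 102.83x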
INFO 02-04 15:41:19 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:02<00:00, 2.03it/s]
INFO 02-04 15:41:22 [model_runner.py:1752] Graph capturing finished in 3 secs, took 0.12 GiB
INFO 02-04 15:41:22 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 19.95 seconds
[SUCCESS] Model loaded
[INFO] Thread pool configuration:
  - CPU thread pool: 2 threads
  - GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8707
[INFO] API docs: http://0.0.0.0:8707/docs
INFO: Started server process [41027]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8707 (Press CTRL+C to quit)
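With Uvicorn up on port 8707, a cheap probe of the FastAPI docs page printed in the startup banner confirms the service is reachable before posting a real PDF. A minimal sketch; the 5-second timeout is an arbitrary choice:

import requests

# The /docs URL comes from the startup banner above; a 200 means the app
# reached "Application startup complete." and is accepting connections.
resp = requests.get("http://localhost:8707/docs", timeout=5)
print(resp.status_code)  # expected: 200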
Some kwargs in processor config are unused and will not have any effect: mask_prompt, sft_format, normalize, add_special_token, image_mean, candidate_resolutions, image_token, ignore_id, pad_token, patch_size, image_std, downsample_ratio.
[1/3] Tokenizing 22 pages...
[1/3] Tokenization complete
[2/3] Running GPU batch inference on 22 pages...
Processed prompts: 100%|██████████| 22/22 [01:37<00:00, 4.45s/it, est. speed input: 205.39 toks/s, output: 246.51 toks/s]
[2/3] GPU inference complete
OCR time: 104.02s
[3/3] Post-processing...
[3/3] Post-processing complete (0.01s)
============================================================
[SUCCESS] All done
Total time: 115.46s
Average: 5.25s/page
============================================================
INFO: 127.0.0.1:43306 - "POST /ocr HTTP/1.1" 200 OK
INFO 02-04 17:02:43 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py:472: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 17:02:48 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 17:02:48 [config.py:721] This model supports multiple tasks: {'embed', 'classify', 'score', 'generate', 'reward'}. Defaulting to 'generate'.
INFO 02-04 17:02:48 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr', speculative_config=None, tokenizer='/home/lst/deepseek_ocr', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[24,16,8,4,2,1],"max_capture_size":24}, use_cached_outputs=False,
INFO 02-04 17:02:49 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 17:02:49 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 17:02:49 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 17:02:49 [worker_base.py:653] ########## 43881 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 17:02:49 [worker_base.py:654] ########## 43881 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 17:02:49.512413 43881 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 17:02:49.512501 43881 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:02:49.512948 43881 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x560c9077aaf0, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 17:02:49.512964 43881 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:02:49.532961 43881 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x560c9077aaf0, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 17:02:49.533003 43881 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:02:49.534246 43881 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x560c9077aaf0, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 17:02:49.534269 43881 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:02:49.535202 43881 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x560c9077aaf0, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 17:02:49.535223 43881 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 17:02:49 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 17:02:49 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr...
INFO 02-04 17:02:50 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24] is overridden by config [1, 2, 4, 8, 16, 24]
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py", line 509, in <module>
[rank0]: main()
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py", line 500, in main
[rank0]: initialize_model(args.model_path, args.gpu_id)
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py", line 270, in initialize_model
[rank0]: llm = LLM(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/utils.py", line 1182, in inner
[rank0]: return fn(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py", line 255, in __init__
[rank0]: self.llm_engine = LLMEngine.from_engine_args(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 520, in from_engine_args
[rank0]: return engine_cls.from_vllm_config(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 496, in from_vllm_config
[rank0]: return cls(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 283, in __init__
[rank0]: self.model_executor = executor_class(vllm_config=vllm_config)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/executor_base.py", line 52, in __init__
[rank0]: self._init_executor()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/uniproc_executor.py", line 47, in _init_executor
[rank0]: self.collective_rpc("load_model")
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
[rank0]: answer = run_method(self.driver_worker, method, args, kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/utils.py", line 2624, in run_method
[rank0]: return func(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 203, in load_model
[rank0]: self.model_runner.load_model()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 1136, in load_model
[rank0]: self.model = get_model(vllm_config=self.vllm_config)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/__init__.py", line 14, in get_model
[rank0]: return loader.load_model(vllm_config=vllm_config)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/loader.py", line 454, in load_model
[rank0]: model = _initialize_model(vllm_config=vllm_config)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/loader.py", line 133, in _initialize_model
[rank0]: return model_class(vllm_config=vllm_config, prefix=prefix)
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepseek_ocr.py", line 321, in __init__
[rank0]: self.language_model = init_vllm_registered_model(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/utils.py", line 286, in init_vllm_registered_model
[rank0]: return _initialize_model(vllm_config=vllm_config, prefix=prefix)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/loader.py", line 133, in _initialize_model
[rank0]: return model_class(vllm_config=vllm_config, prefix=prefix)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/deepseek.py", line 457, in __init__
[rank0]: self.model = DeepseekModel(vllm_config=vllm_config,
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/deepseek.py", line 358, in __init__
[rank0]: self.start_layer, self.end_layer, self.layers = make_layers(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/utils.py", line 609, in make_layers
[rank0]: [PPMissingLayer() for _ in range(start_layer)] + [
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/utils.py", line 610, in <listcomp>
[rank0]: maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/deepseek.py", line 360, in <lambda>
[rank0]: lambda prefix: DeepseekDecoderLayer(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/deepseek.py", line 300, in __init__
[rank0]: self.mlp = DeepseekMoE(config=config,
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/deepseek.py", line 120, in __init__
[rank0]: self.pack_params()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/deepseek.py", line 144, in pack_params
[rank0]: self.w1 = torch._utils._flatten_dense_tensors(w1)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 510, in _flatten_dense_tensors
[rank0]: return torch._C._nn.flatten_dense_tensors(tensors)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/_device.py", line 79, in __torch_function__
[rank0]: return func(*args, **kwargs)
[rank0]: torch.OutOfMemoryError: HIP out of memory. Tried to allocate 280.00 MiB. GPU 0 has a total capacity of 63.98 GiB of which 0 bytes is free. Of the allocated memory 3.30 GiB is allocated by PyTorch, and 15.49 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_HIP_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
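This OOM is worth reading closely: the GPU reports 0 bytes free while this process has allocated only 3.30 GiB, so most of the 64 GiB is held elsewhere — consistent with the 15:41 server instance, which reserved 48.20 GiB of KV cache on the same card, still running. Freeing the card or switching devices is the real fix; the allocator hint in the message itself only mitigates fragmentation. A sketch of both knobs, which must be set before torch/vLLM initialize (the device index is a placeholder):

import os

# Illustrative only: point the process at an idle device; the launch
# script further down uses HIP_VISIBLE_DEVICES the same way.
os.environ["HIP_VISIBLE_DEVICES"] = "1"
# The setting the OOM message itself suggests; it reduces fragmentation,
# it cannot reclaim memory held by another process.
os.environ["PYTORCH_HIP_ALLOC_CONF"] = "expandable_segments:True"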
INFO 02-04 17:19:38 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py:472: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 17:19:43 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 17:19:43 [config.py:721] This model supports multiple tasks: {'generate', 'score', 'reward', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 02-04 17:19:43 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr', speculative_config=None, tokenizer='/home/lst/deepseek_ocr', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[24,16,8,4,2,1],"max_capture_size":24}, use_cached_outputs=False,
INFO 02-04 17:19:43 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 17:19:43 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 17:19:43 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 17:19:43 [worker_base.py:653] ########## 44951 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 17:19:43 [worker_base.py:654] ########## 44951 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 17:19:43.998072 44951 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 17:19:43.998150 44951 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:19:43.998600 44951 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55f9383f5ac0, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 17:19:43.998611 44951 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:19:44.017944 44951 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55f9383f5ac0, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 17:19:44.017983 44951 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:19:44.019214 44951 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55f9383f5ac0, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 17:19:44.019232 44951 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:19:44.020287 44951 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x55f9383f5ac0, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 17:19:44.020305 44951 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 17:19:44 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 17:19:44 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr...
INFO 02-04 17:19:44 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24] is overridden by config [1, 2, 4, 8, 16, 24]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 6.86it/s]
INFO 02-04 17:19:47 [loader.py:460] Loading weights took 2.11 seconds
INFO 02-04 17:19:47 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.538627 seconds
Some kwargs in processor config are unused and will not have any effect: add_special_token, pad_token, downsample_ratio, ignore_id, mask_prompt, image_std, image_mean, normalize, patch_size, image_token, sft_format, candidate_resolutions.
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py", line 509, in <module>
[rank0]: main()
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py", line 500, in main
[rank0]: initialize_model(args.model_path, args.gpu_id)
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py", line 270, in initialize_model
[rank0]: llm = LLM(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/utils.py", line 1182, in inner
[rank0]: return fn(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py", line 255, in __init__
[rank0]: self.llm_engine = LLMEngine.from_engine_args(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 520, in from_engine_args
[rank0]: return engine_cls.from_vllm_config(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 496, in from_vllm_config
[rank0]: return cls(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 286, in __init__
[rank0]: self._initialize_kv_caches()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 432, in _initialize_kv_caches
[rank0]: self.model_executor.determine_num_available_blocks())
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/executor_base.py", line 103, in determine_num_available_blocks
[rank0]: results = self.collective_rpc("determine_num_available_blocks")
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
[rank0]: answer = run_method(self.driver_worker, method, args, kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/utils.py", line 2624, in run_method
[rank0]: return func(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank0]: return func(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 249, in determine_num_available_blocks
[rank0]: self.model_runner.profile_run()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank0]: return func(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 1262, in profile_run
[rank0]: self._dummy_run(max_num_batched_tokens, max_num_seqs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 1388, in _dummy_run
[rank0]: self.execute_model(model_input, kv_caches, intermediate_tensors)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank0]: return func(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 1948, in execute_model
[rank0]: hidden_or_intermediate_states = model_executable(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepseek_ocr.py", line 543, in forward
[rank0]: vision_embeddings = self.get_multimodal_embeddings(**kwargs)
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepseek_ocr.py", line 503, in get_multimodal_embeddings
[rank0]: vision_embeddings = self._process_image_input(image_input)
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepseek_ocr.py", line 486, in _process_image_input
[rank0]: vision_features = self._pixel_values_to_embedding(
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepseek_ocr.py", line 394, in _pixel_values_to_embedding
[rank0]: local_features_1 = self.sam_model(patches)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py", line 176, in forward
[rank0]: x = blk(x)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py", line 241, in forward
[rank0]: x = self.attn(x)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py", line 294, in forward
[rank0]: qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/linear.py", line 117, in forward
[rank0]: return F.linear(input, self.weight, self.bias)
[rank0]: RuntimeError: CUDA error: HIPBLAS_STATUS_ALLOC_FAILED when calling `hipblasCreate(handle)`
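This second failure is still an allocation problem, just surfaced differently: it happens inside profile_run (the warmup dummy batch in the traceback), where hipBLAS cannot even create a handle. vLLM's own messages earlier in this log name the two relevant knobs: decrease gpu_memory_utilization or switch to eager mode. A hedged sketch of what the server's initialize_model could pass — this is not the actual code in deepseek_ocr_server.py, only the settings the log recommends:

from vllm import LLM

llm = LLM(
    model="/home/lst/deepseek_ocr",
    trust_remote_code=True,
    gpu_memory_utilization=0.80,  # the profiling log above shows the 0.90 default
    enforce_eager=True,           # skip cudagraph capture entirely
)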
INFO 02-04 17:20:27 [__init__.py:240] Automatically detected platform rocm.
/home/lst/DeepSeek-OCR-vllm/deepseek_ocr_server.py:472: DeprecationWarning:
on_event is deprecated, use lifespan event handlers instead.
Read more about it in the
[FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
@app.on_event("shutdown")
[INFO] Loading model: /home/lst/deepseek_ocr
INFO 02-04 17:20:32 [config.py:460] Overriding HF config with {'architectures': ['DeepseekOCRForCausalLM']}
INFO 02-04 17:20:32 [config.py:721] This model supports multiple tasks: {'reward', 'embed', 'score', 'classify', 'generate'}. Defaulting to 'generate'.
INFO 02-04 17:20:32 [llm_engine.py:244] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/home/lst/deepseek_ocr', speculative_config=None, tokenizer='/home/lst/deepseek_ocr', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=/home/lst/deepseek_ocr, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=True, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[24,16,8,4,2,1],"max_capture_size":24}, use_cached_outputs=False,
INFO 02-04 17:20:33 [rocm.py:226] None is not supported in AMD GPUs.
INFO 02-04 17:20:33 [rocm.py:227] Using ROCmFlashAttention backend.
WARNING 02-04 17:20:33 [worker_base.py:41] VLLM_RANK0_NUMA is unset or set incorrectly, vllm will not bind to numa! VLLM_RANK0_NUMA = -1
INFO 02-04 17:20:33 [worker_base.py:653] ########## 45841 process(rank0) is running on CPU(s): {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}
INFO 02-04 17:20:33 [worker_base.py:654] ########## 45841 process(rank0) is running on memnode(s): {0, 1, 2, 3}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I0204 17:20:33.298095 45841 ProcessGroupNCCL.cpp:881] [PG 0 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0, SPLIT_COLOR: 0, PG Name: 0
I0204 17:20:33.298182 45841 ProcessGroupNCCL.cpp:890] [PG 0 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:20:33.298664 45841 ProcessGroupNCCL.cpp:881] [PG 1 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x5590ffe7c9d0, SPLIT_COLOR: 3389850942126204093, PG Name: 1
I0204 17:20:33.298677 45841 ProcessGroupNCCL.cpp:890] [PG 1 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:20:33.306862 45841 ProcessGroupNCCL.cpp:881] [PG 3 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x5590ffe7c9d0, SPLIT_COLOR: 3389850942126204093, PG Name: 3
I0204 17:20:33.306901 45841 ProcessGroupNCCL.cpp:890] [PG 3 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:20:33.308358 45841 ProcessGroupNCCL.cpp:881] [PG 5 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x5590ffe7c9d0, SPLIT_COLOR: 3389850942126204093, PG Name: 5
I0204 17:20:33.308387 45841 ProcessGroupNCCL.cpp:890] [PG 5 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
I0204 17:20:33.309455 45841 ProcessGroupNCCL.cpp:881] [PG 7 Rank 0] ProcessGroupNCCL initialization options: size: 1, global rank: 0, TIMEOUT(ms): 600000, USE_HIGH_PRIORITY_STREAM: 0, SPLIT_FROM: 0x5590ffe7c9d0, SPLIT_COLOR: 3389850942126204093, PG Name: 7
I0204 17:20:33.309473 45841 ProcessGroupNCCL.cpp:890] [PG 7 Rank 0] ProcessGroupNCCL environments: NCCL version: 2.18.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 3, TORCH_NCCL_DUMP_ON_TIMEOUT: 0, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_ENABLE_MONITORING: 1, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 600, TORCH_NCCL_TRACE_BUFFER_SIZE: 0, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0
INFO 02-04 17:20:33 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 02-04 17:20:33 [model_runner.py:1133] Starting to load model /home/lst/deepseek_ocr...
INFO 02-04 17:20:34 [config.py:3627] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24] is overridden by config [1, 2, 4, 8, 16, 24]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 7.06it/s]
INFO 02-04 17:20:36 [loader.py:460] Loading weights took 2.12 seconds
INFO 02-04 17:20:37 [model_runner.py:1165] Model loading took 6.2319 GiB and 3.572350 seconds
Some kwargs in processor config are unused and will not have any effect: normalize, sft_format, image_token, mask_prompt, ignore_id, downsample_ratio, patch_size, candidate_resolutions, image_mean, pad_token, add_special_token, image_std.
/home/lst/DeepSeek-OCR-vllm/deepencoder/sam_vary_sdpa.py:310: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:627.)
x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
WARNING 02-04 17:20:51 [fused_moe.py:882] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=K100_AI.json
INFO 02-04 17:20:51 [worker.py:287] Memory profiling takes 14.59 seconds
INFO 02-04 17:20:51 [worker.py:287] the current vLLM instance can use total_gpu_memory (63.98GiB) x gpu_memory_utilization (0.90) = 57.59GiB
INFO 02-04 17:20:51 [worker.py:287] model weights take 6.23GiB; non_torch_memory takes 1.55GiB; PyTorch activation peak memory takes 1.60GiB; the rest of the memory reserved for KV Cache is 48.20GiB.
INFO 02-04 17:20:52 [executor_base.py:112] # rocm blocks: 13162, # CPU blocks: 1092
INFO 02-04 17:20:52 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 102.83x
INFO 02-04 17:20:54 [model_runner.py:1523] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|██████████| 6/6 [00:03<00:00, 1.70it/s]
INFO 02-04 17:20:57 [model_runner.py:1752] Graph capturing finished in 4 secs, took 0.12 GiB
INFO 02-04 17:20:57 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 20.50 seconds
[SUCCESS] Model loaded
[INFO] Thread pool configuration:
  - CPU thread pool: 2 threads
  - GPU thread pool: 1 thread
[INFO] Service started: http://0.0.0.0:8708
[INFO] API docs: http://0.0.0.0:8708/docs
INFO: Started server process [45841]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8708 (Press CTRL+C to quit)
export VLLM_USE_V1=0
export HIP_VISIBLE_DEVICES=0
# image: streaming output
#python run_dpsk_ocr_image.py
# pdf
python run_dpsk_ocr_pdf.py
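The two exports mirror what the logs above show: VLLM_USE_V1=0 forces the V0 engine ("Initializing a V0 LLM engine"), and HIP_VISIBLE_DEVICES pins the process to one GPU. If one prefers setting them in-process, they have to land before vllm or torch are imported; a sketch, separate from the client script that follows:

import os

# Must run before vllm/torch are imported, or the values are ignored.
os.environ["VLLM_USE_V1"] = "0"          # matches "Initializing a V0 LLM engine"
os.environ["HIP_VISIBLE_DEVICES"] = "0"  # expose only GPU 0 to this process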
import os
import requests
from pathlib import Path
from config import INPUT_PATH, OUTPUT_PATH


def ocr_pdf(pdf_path, server_url="http://localhost:8002", save_result=True):
    """
    Run OCR on a PDF document.

    Args:
        pdf_path: path to the PDF file
        server_url: address of the OCR service
        save_result: whether to save the recognition result to a file

    Returns:
        dict: the recognition result
    """
    # 1. Check the file
    pdf_file = Path(pdf_path)
    if not pdf_file.exists():
        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
    if pdf_file.suffix.lower() != '.pdf':
        raise ValueError(f"Not a PDF file: {pdf_path}")

    # 2. Read the file size (for display only)
    file_size_mb = pdf_file.stat().st_size / (1024 * 1024)
    print(f"File name: {pdf_file.name}")
    print(f"File size: {file_size_mb:.2f} MB")
    print("Processing...")

    # 3. Build the request
    api_url = f"{server_url}/ocr"

    # 4. Send the request
    with open(pdf_path, 'rb') as f:
        files = {'file': (pdf_file.name, f, 'application/pdf')}
        # Extra parameters can be added here, e.g.:
        # data = {'enable_description': True}  # enable image descriptions (slower)
        response = requests.post(api_url, files=files)

    # 5. Handle the result
    if response.status_code == 200:
        result = response.json()
        print("Processing complete!")
        print("Statistics:")
        print(f"  - Total pages: {result['page_count']}")
        print(f"  - Processing time: {result['processing_time']:.2f} s")
        print(f"  - Average speed: {result['processing_time'] / result['page_count']:.2f} s/page")
        # 6. Save the result to a file
        if save_result:
            os.makedirs(OUTPUT_PATH, exist_ok=True)
            # Keep only the file name: joining a full path onto OUTPUT_PATH
            # would discard OUTPUT_PATH entirely for absolute inputs.
            output_file = pdf_file.with_suffix('.md').name
            file_path = os.path.join(OUTPUT_PATH, output_file)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(result['markdown'])
            print(f"Result saved to: {file_path}")
        return result
    else:
        print(f"Processing failed: {response.status_code}")
        print(f"Error message: {response.text}")
        return None


# Usage example
# Replace with your actual PDF path
# pdf_file = "./doc/DeepSeek_OCR_paper_layouts.pdf"
pdf_file = INPUT_PATH

# Call the OCR function
result = ocr_pdf(pdf_file)

if result:
    # Show a preview of the recognition result
    print("\n" + "=" * 60)
    print("Result preview:")
    print("=" * 60)
    # First 1000 characters
    preview_text = result['markdown'][:1000]
    print(preview_text)
    if len(result['markdown']) > 1000:
        print("\n... (content truncated)")
    print(f"\nFull content: {len(result['markdown'])} characters")
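One note on wiring this client to the deployment above: the script's default server_url is port 8002, while the server instances in these logs listen on 8707 and 8708, so the port has to be passed explicitly. A one-line usage sketch; the PDF path is a placeholder:

# Match the port from the startup banner instead of the 8002 default.
result = ocr_pdf("./doc/sample.pdf", server_url="http://localhost:8707")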