# Documentation:
# https://docs.sglang.ai/backend/server_arguments.html#common-launch-commands
services:
  mineru-sglang:
    image: mineru-sglang:latest
    container_name: mineru-sglang
    restart: always
    ports:
      # Quoted to keep the host:container mapping an explicit string
      # (Compose best practice for port mappings).
      - "30000:30000"
    environment:
      # Resolve models from the local cache instead of downloading.
      MINERU_MODEL_SOURCE: local
    entrypoint: mineru-sglang-server
    command:
      --host 0.0.0.0
      --port 30000
      # --enable-torch-compile  # You can also enable torch.compile to accelerate inference speed by approximately 15%
      # --dp 2  # If you have more than two GPUs with 24GB VRAM or above, you can use sglang's multi-GPU parallel mode to increase throughput
      # --tp 2  # If you have two GPUs with 12GB or 16GB VRAM, you can use the Tensor Parallel (TP) mode
      # --mem-fraction-static 0.7  # If you have two GPUs with 11GB VRAM, in addition to Tensor Parallel mode, you need to reduce the KV cache size
    ulimits:
      # Unlimited locked memory and a large stack, as required for
      # pinned-memory allocations used by the inference runtime.
      memlock: -1
      stack: 67108864
    # Share the host IPC namespace so multi-process workers can use
    # shared memory without hitting the default /dev/shm limit.
    ipc: host
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"]
      # Explicit timings: give the server time to load model weights
      # before failed probes mark the container unhealthy.
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Reserve GPU 0; extend the list (and see --dp/--tp above)
              # for multi-GPU setups.
              device_ids: ["0"]
              capabilities: [gpu]