Unverified Commit 9e6256c5 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #2758 from QIN2DIM/compose-enable-torch-compile

feat: optimize SGLang server args
parents 8a40ff3c 3104bc2c
# Documentation:
# https://docs.sglang.ai/backend/server_arguments.html#common-launch-commands
services:
mineru-sglang:
image: mineru-sglang:latest
......@@ -11,6 +13,10 @@ services:
command:
--host 0.0.0.0
--port 30000
# --enable-torch-compile # You can also enable torch.compile to accelerate inference speed by approximately 15%
# --dp 2 # If you have more than two GPUs with 24GB VRAM or above, you can use sglang's multi-GPU parallel mode to increase throughput
# --tp 2 # If you have two GPUs with 12GB or 16GB VRAM, you can use the Tensor Parallel (TP) mode
# --mem-fraction-static 0.7 # If you have two GPUs with 11GB VRAM, in addition to Tensor Parallel mode, you need to reduce the KV cache size
ulimits:
memlock: -1
stack: 67108864
......@@ -23,4 +29,4 @@ services:
devices:
- driver: nvidia
device_ids: ["0"]
capabilities: [gpu]
\ No newline at end of file
capabilities: [gpu]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment