"vscode:/vscode.git/clone" did not exist on "6a9ac7271beed04fb85cfa99506618cdde36a48c"
Unverified commit 55de40f7 authored by Shi Shuai, committed by GitHub

[Docs]: Fix Multi-User Port Allocation Conflicts (#3601)


Co-authored-by: zhaochenyang20 <zhaochen20@outlook.com>
Co-authored-by: simveit <simp.veitner@gmail.com>
parent 6b0aeb58
@@ -36,42 +36,70 @@ find . -name '*.ipynb' -exec nbstripout {} \;
# After these checks pass, push your changes and open a PR on your branch
pre-commit run --all-files
```
---
### **Port Allocation and CI Efficiency**
**To launch and kill the server:**

```python
from sglang.test.test_utils import is_in_ci
from sglang.utils import wait_for_server, print_highlight, terminate_process

if is_in_ci():
    from patch import launch_server_cmd
else:
    from sglang.utils import launch_server_cmd

server_process, port = launch_server_cmd(
    """
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
 --host 0.0.0.0
"""
)

wait_for_server(f"http://localhost:{port}")

# Terminate the server
terminate_process(server_process)
```

**To launch and kill the engine:**

```python
# Launch the engine
import sglang as sgl
import asyncio
from sglang.test.test_utils import is_in_ci

if is_in_ci():
    import patch

llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")

# Terminate the engine
llm.shutdown()
```
### **Why this approach?**
- **Dynamic Port Allocation**: Avoids port conflicts by selecting an available port at runtime, enabling multiple server instances to run in parallel.
- **Optimized for CI**: The `patch` version of `launch_server_cmd` and `sgl.Engine()` in CI environments helps manage GPU memory dynamically, preventing conflicts and improving test parallelism.
- **Better Parallel Execution**: Ensures smooth concurrent tests by avoiding fixed port collisions and optimizing memory usage (see the sketch below).
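For instance, here is a minimal sketch of the parallel case, assuming the `launch_server_cmd`, `wait_for_server`, and `terminate_process` helpers shown above and enough GPU memory for two model instances; it is illustrative only, not a required workflow:

```python
# Illustrative only: two documentation servers running side by side.
from sglang.test.test_utils import is_in_ci
from sglang.utils import wait_for_server, terminate_process

if is_in_ci():
    from patch import launch_server_cmd
else:
    from sglang.utils import launch_server_cmd

# Each call reserves its own free port, so the two servers never collide.
proc_a, port_a = launch_server_cmd(
    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0"
)
proc_b, port_b = launch_server_cmd(
    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0"
)

wait_for_server(f"http://localhost:{port_a}")
wait_for_server(f"http://localhost:{port_b}")

# ... send requests to both servers here ...

terminate_process(proc_a)
terminate_process(proc_b)
```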
### **Model Selection**
For demonstrations in the docs, **prefer smaller models** to reduce memory consumption and speed up inference. Running larger models in CI can lead to instability due to memory constraints.
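A minimal sketch of this guideline using the offline engine; `Qwen/Qwen2.5-0.5B-Instruct` is only an illustrative choice of a small model, not a required one:

```python
import sglang as sgl
from sglang.test.test_utils import is_in_ci

if is_in_ci():
    import patch

# A small model keeps memory usage low and startup fast in CI.
# Qwen/Qwen2.5-0.5B-Instruct is an illustrative choice, not a requirement.
llm = sgl.Engine(model_path="Qwen/Qwen2.5-0.5B-Instruct")

output = llm.generate(
    "What is the capital of France?",
    {"temperature": 0, "max_new_tokens": 32},
)
print(output["text"])

llm.shutdown()
```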
### **Prompt Alignment Example**
When designing prompts, ensure they align with SGLang’s structured formatting. For example:
```python
prompt = """You are an AI assistant. Answer concisely and accurately.
User: What is the capital of France?
Assistant: The capital of France is Paris."""
```
This keeps responses aligned with expected behavior and improves reliability across the different documentation files.
@@ -405,7 +405,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
]
},
{
...
@@ -252,7 +252,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)\n",
"\n",
"embedding_process, port = launch_server_cmd(\n",
" \"\"\"\n",
@@ -286,7 +286,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(embedding_process)"
]
},
{
@@ -304,7 +304,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(embedding_process)\n",
"\n",
"# Note that SGLang now treats embedding models and reward models as the same type of models.\n",
"# This will be updated in the future.\n",
@@ -355,7 +355,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(reward_process)"
]
},
{
@@ -425,7 +425,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(tokenizer_free_server_process)"
]
}
],
...
@@ -512,7 +512,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
]
}
],
...
@@ -169,7 +169,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(embedding_process)"
]
}
],
...
@@ -243,7 +243,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(embedding_process)"
]
},
{
...
import os
import weakref

from sglang.utils import execute_shell_command, reserve_port

@@ -21,15 +22,29 @@ def patched_post_init(self):
server_args_mod.ServerArgs.__post_init__ = patched_post_init

process_socket_map = weakref.WeakKeyDictionary()


def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
    """
    Launch the server using the given command.
    If no port is specified, a free port is reserved.
    """
    if port is None:
        port, lock_socket = reserve_port(host)
    else:
        lock_socket = None

    extra_flags = (
        f"--max-running-requests {DEFAULT_MAX_RUNNING_REQUESTS} "
        f"--max-total-tokens {DEFAULT_MAX_TOTAL_TOKENS} "
        f"--disable-cuda-graph"
    )

    full_command = f"{command} --port {port} {extra_flags}"
    process = execute_shell_command(full_command)

    if lock_socket is not None:
        process_socket_map[process] = lock_socket

    return process, port
@@ -243,15 +243,8 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
]
}
],
"metadata": {
...
@@ -397,7 +397,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
]
},
{
...
SGLang Documentation
====================

SGLang is a fast serving framework for large language models and vision language models.
It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

@@ -10,7 +10,6 @@ The core features include:
- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

.. toctree::
   :maxdepth: 1
   :caption: Getting Started

@@ -39,7 +38,6 @@ The core features include:
   frontend/frontend.md
   frontend/choices_methods.md

.. toctree::
   :maxdepth: 1
   :caption: SGLang Router

@@ -47,24 +45,47 @@ The core features include:
   router/router.md
References
==========

General
-------

.. toctree::
   :maxdepth: 1

   references/supported_models.md
   references/contribution_guide.md
   references/troubleshooting.md
   references/faq.md
   references/learn_more.md

Hardware
--------

.. toctree::
   :maxdepth: 1

   references/AMD.md
   references/amd_configure.md
   references/nvidia_jetson.md

Advanced Models & Deployment
----------------------------

.. toctree::
   :maxdepth: 1

   references/deepseek.md
   references/multi_node.md
   references/multi_node_inference_k8s_lws.md
   references/modelscope.md

Performance & Tuning
--------------------

.. toctree::
   :maxdepth: 1

   references/sampling_params.md
   references/hyperparameter_tuning.md
   references/benchmark_and_profiling.md
   references/accuracy_evaluation.md
   references/custom_chat_template.md
   references/quantization.md
@@ -2,19 +2,27 @@
You can install SGLang using any of the methods below. For running DeepSeek V3/R1 with SGLang, refer to [DeepSeek V3 Support](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3). It is always recommended to use the [latest release version](https://pypi.org/project/sglang/#history) and deploy it with [Docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended) to avoid already-fixed issues and environment-related problems.

## Method 1: With pip or uv
We recommend using uv to speed up installation:

```bash
pip install --upgrade pip
pip install uv
uv pip install sgl-kernel --force-reinstall --no-deps
uv pip install "sglang[all]>=0.4.3.post2" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python
```

**Quick Fix to Installation**

- SGLang currently uses torch 2.5, so you need to install the flashinfer version for torch 2.5. If you want to install flashinfer separately, please refer to the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html). Please note that the package currently used by FlashInfer is named `flashinfer-python`, not `flashinfer`.
- If you experience an error like `OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root`, please try either of the following solutions:
  1. Use `export CUDA_HOME=/usr/local/cuda-<your-cuda-version>` to set the `CUDA_HOME` environment variable.
  2. Follow the procedure described in the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) first, then install SGLang as described above.
- If you encounter `ImportError: cannot import name 'is_valid_list_of_images' from 'transformers.models.llama.image_processing_llama'`, use the version of `transformers` specified in [pyproject.toml](https://github.com/sgl-project/sglang/blob/main/python/pyproject.toml). Currently, just run `pip install transformers==4.48.3`.

## Method 2: From source
```
...
@@ -5,12 +5,15 @@ import importlib
import json
import logging
import os
import random
import signal
import socket
import subprocess
import sys
import time
import traceback
import urllib.request
import weakref
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO
from json import dumps

@@ -21,6 +24,8 @@ import requests
from IPython.display import HTML, display
from tqdm import tqdm

from sglang.srt.utils import kill_process_tree

logger = logging.getLogger(__name__)
@@ -306,27 +311,12 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
    return filename


def is_in_ci():
    from sglang.test.test_utils import is_in_ci

    return is_in_ci()


def print_highlight(html_content: str):
    if is_in_ci():
        html_content = str(html_content).replace("\n", "<br>")

@@ -335,55 +325,44 @@ def print_highlight(html_content: str):
        print(html_content)
process_socket_map = weakref.WeakKeyDictionary()


def reserve_port(host, start=30000, end=40000):
    """
    Reserve an available port by trying to bind a socket.
    Returns a tuple (port, lock_socket) where `lock_socket` is kept open to hold the lock.
    """
    candidates = list(range(start, end))
    random.shuffle(candidates)

    for port in candidates:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            # Attempt to bind to the port on the given host
            sock.bind((host, port))
            return port, sock
        except socket.error:
            sock.close()  # Failed to bind, try the next port
            continue
    raise RuntimeError("No free port available.")


def release_port(lock_socket):
    """
    Release the reserved port by closing the lock socket.
    """
    try:
        lock_socket.close()
    except Exception as e:
        print(f"Error closing socket: {e}")
def execute_shell_command(command: str) -> subprocess.Popen:
    """
    Execute a shell command and return its process handle.
    """
    # Replace newline continuations and split the command string.
    command = command.replace("\\\n", " ").replace("\\", " ")
    parts = command.split()
    return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT)

@@ -395,21 +374,28 @@ def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
    If no port is specified, a free port is reserved.
    """
    if port is None:
        port, lock_socket = reserve_port(host)
    else:
        lock_socket = None

    full_command = f"{command} --port {port}"
    process = execute_shell_command(full_command)

    if lock_socket is not None:
        process_socket_map[process] = lock_socket

    return process, port


def terminate_process(process):
    """
    Terminate the process and automatically release the reserved port.
    """
    kill_process_tree(process.pid)

    lock_socket = process_socket_map.pop(process, None)
    if lock_socket is not None:
        release_port(lock_socket)


def wait_for_server(base_url: str, timeout: int = None) -> None:
...