Unverified commit 935cda94, authored by Lianmin Zheng and committed by GitHub

Misc clean up; Remove the support of jump forward (#4032)

parent 110e0066
@@ -385,7 +385,7 @@
"print(gen_response)\n",
"\n",
"# parse the response\n",
-"parse_url = f\"http://localhost:{port}/function_call\"\n",
+"parse_url = f\"http://localhost:{port}/parse_function_call\"\n",
"\n",
"function_call_input = {\n",
"    \"text\": gen_response,\n",
......
# Sampling Parameters in SGLang Runtime
This doc describes the sampling parameters of the SGLang Runtime.
It is the low-level endpoint of the runtime.
If you want a high-level endpoint that can automatically handle chat templates, consider using the [OpenAI Compatible API](https://docs.sglang.ai/backend/openai_api_completions.html).

The `/generate` endpoint accepts the following arguments in the JSON format. You can find code examples below and more detailed usage in the [native API doc](https://docs.sglang.ai/backend/native_api.html).

```python
@dataclass
class GenerateReqInput:
    # The input prompt. It can be a single prompt or a batch of prompts.
    text: Optional[Union[List[str], str]] = None
    # The token ids for text; one can specify either text or input_ids.
    input_ids: Optional[Union[List[List[int]], List[int]]] = None
    # The embeddings for input_ids; one can specify either text, input_ids, or input_embeds.
    input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
    # The image input. It can be a file name, a URL, or a base64 encoded string.
    # See also python/sglang/srt/utils.py:load_image.
    image_data: Optional[Union[List[str], str]] = None
    # The sampling_params. See descriptions below.
    sampling_params: Optional[Union[List[Dict], Dict]] = None
    # The request id.
    rid: Optional[Union[List[str], str]] = None
    # Whether to return logprobs.
    return_logprob: Optional[Union[List[bool], bool]] = None
    # If returning logprobs, the start location in the prompt for returning logprobs.
    # By default, this value is "-1", which means it will only return logprobs for output tokens.
    logprob_start_len: Optional[Union[List[int], int]] = None
    # If returning logprobs, the number of top logprobs to return at each position.
    top_logprobs_num: Optional[Union[List[int], int]] = None
    # If returning logprobs, the token ids to return logprobs for.
    token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None
    # Whether to detokenize tokens in text in the returned logprobs.
    return_text_in_logprobs: bool = False
    # Whether to stream output.
    stream: bool = False
    # The modalities of the image data [image, multi-images, video].
    modalities: Optional[List[str]] = None
    # LoRA related: the path to the LoRA weights.
    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
    # Custom logit processor for advanced sampling control. Must be a serialized instance
    # of `CustomLogitProcessor` in python/sglang/srt/sampling/custom_logit_processor.py.
    # Use the processor's `to_str()` method to generate the serialized string.
    custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
    # Whether to return hidden states of the model. Note that each time this changes, the CUDA graph
    # will be recaptured, which might lead to a performance hit. See the
    # [hidden states example](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/hidden_states.py) for more information.
    return_hidden_states: bool = False
```

The `sampling_params` follows this format:

```python
# The maximum number of output tokens
max_new_tokens: int = 128,
# Stop when hitting any of the strings in this list. Generation stops if one of these words is sampled.
stop: Optional[Union[str, List[str]]] = None,
# Stop when hitting any of the token ids in this list
stop_token_ids: Optional[List[int]] = [],
# Sampling temperature. `temperature = 0` corresponds to greedy sampling;
# a higher temperature leads to more diversity.
temperature: float = 1.0,
# Top-p sampling: select tokens from the smallest sorted set whose cumulative probability
# exceeds `top_p`. When `top_p = 1`, this reduces to unrestricted sampling from all tokens.
top_p: float = 1.0,
# Top-k sampling: randomly select from the `k` highest-probability tokens.
top_k: int = -1,
# Min-p sampling: sample from tokens with probability larger than `min_p * highest_token_probability`.
min_p: float = 0.0,
# Do parallel sampling and return `n` outputs. Generating multiple outputs in one request (n > 1)
# is discouraged; repeating the same prompt several times offers better control and efficiency.
n: int = 1,

## Structured Outputs
# Only one of the below three can be set for a request.

# Constrain the output to follow a given JSON schema.
json_schema: Optional[str] = None,
# Constrain the output to follow a given regular expression.
regex: Optional[str] = None,
# Constrain the output to follow a given EBNF grammar.
ebnf: Optional[str] = None,

## Penalties
# To use the penalizers below, launch the server with `--disable-overlap`.
# Please note that this might degrade performance.

# Float that penalizes new tokens based on their frequency in the generated text so far.
# Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to
# repeat tokens. Must be -2 <= value <= 2. Setting to 0 (default) will disable this penalty.
frequency_penalty: float = 0.0,
# Float that penalizes new tokens based on whether they appear in the generated text so far.
# Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat
# tokens. Must be -2 <= value <= 2. Setting to 0 (default) will disable this penalty.
presence_penalty: float = 0.0,
# Float that penalizes tokens if they appeared in the prompt or the generation so far.
# Values > 1 encourage the model to use new tokens, while values < 1 encourage the model to
# repeat tokens. The penalization scales multiplicatively.
repetition_penalty: float = 1.0,
# Guides inference to generate at least this number of tokens by penalizing logits of the tokenizer's
# EOS token and `stop_token_ids` to -inf, until the output token reaches the given length.
# Note that any of the `stop` strings can still be generated before reaching `min_new_tokens`, as it is
# difficult to infer the correct token ids from the given `stop` strings.
# Must be 0 <= value < max_new_tokens. Setting to 0 (default) will disable this penalty.
min_new_tokens: int = 0,

# Whether to ignore the EOS token and keep generating
ignore_eos: bool = False,
# Whether to skip the special tokens during detokenization
skip_special_tokens: bool = True,
# Whether to add spaces between special tokens during detokenization
spaces_between_special_tokens: bool = True,
# Whether to keep the matched stop words or EOS token in the generated text instead of trimming them
no_stop_trim: bool = False,

## Custom Parameters for Custom Logit Processor
# A dictionary of custom parameters for the custom logit processor.
# The custom logit processor takes a list of dictionaries as input, where each
# dictionary holds the custom parameters for one request in a batch of the input.
# See also python/sglang/srt/sampling/custom_logit_processor.py
custom_params: Optional[Dict[str, Any]] = None,
```

## Examples

### Normal
Launch a server
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
```
Send a request
```python
import requests
response = requests.post(
"http://localhost:30000/generate",
json={
"text": "The capital of France is",
"sampling_params": {
"temperature": 0,
"max_new_tokens": 32,
},
},
)
print(response.json())
```
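The request-level fields from `GenerateReqInput` above (for example `return_logprob` and `top_logprobs_num`) can be combined with batched prompts in the same call. A minimal sketch against the server launched above; the exact keys inside `meta_info` vary between versions, so treat them as illustrative:
```python
import requests

# Batch of prompts with logprobs enabled.
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": ["The capital of France is", "The capital of Japan is"],
        "sampling_params": {"temperature": 0, "max_new_tokens": 8},
        "return_logprob": True,  # return logprobs for the output tokens
        "top_logprobs_num": 2,   # also return the top-2 candidates at each position
    },
)
# One result per input prompt; logprob details are stored under "meta_info".
for item in response.json():
    print(item["text"], list(item["meta_info"].keys()))
```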
### Streaming
Send a request and stream the output
```python
import requests, json
response = requests.post(
"http://localhost:30000/generate",
json={
"text": "The capital of France is",
"sampling_params": {
"temperature": 0,
"max_new_tokens": 32,
},
"stream": True,
},
stream=True,
)
prev = 0
for chunk in response.iter_lines(decode_unicode=False):
chunk = chunk.decode("utf-8")
if chunk and chunk.startswith("data:"):
if chunk == "data: [DONE]":
break
data = json.loads(chunk[5:].strip("\n"))
output = data["text"].strip()
print(output[prev:], end="", flush=True)
prev = len(output)
print("")
```
### Multi modal
Launch a server
```
python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --chat-template chatml-llava
```
Download an image
```
curl -o example_image.png -L https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true
```
Send a request
```python
import requests
response = requests.post(
"http://localhost:30000/generate",
json={
"text": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n<image>\nDescribe this image in a very short sentence.<|im_end|>\n"
"<|im_start|>assistant\n",
"image_data": "example_image.png",
"sampling_params": {
"temperature": 0,
"max_new_tokens": 32,
},
},
)
print(response.json())
```
The `image_data` can be a file name, a URL, or a base64 encoded string. See also `python/sglang/srt/utils.py:load_image`.
Streaming is supported in a similar manner as [above](#streaming).
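Because `image_data` also accepts a base64-encoded string, the downloaded file can be sent inline instead of by file name. A minimal sketch, assuming the plain base64 payload is accepted by `load_image` as documented above:
```python
import base64
import requests

# Encode the previously downloaded image as a base64 string.
with open("example_image.png", "rb") as f:
    encoded_image = base64.b64encode(f.read()).decode("utf-8")

response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        "<|im_start|>user\n<image>\nDescribe this image in a very short sentence.<|im_end|>\n"
        "<|im_start|>assistant\n",
        "image_data": encoded_image,  # base64 string instead of a file name or URL
        "sampling_params": {"temperature": 0, "max_new_tokens": 32},
    },
)
print(response.json())
```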
### Structured Outputs (JSON, Regex, EBNF)
You can specify a JSON schema, regular expression or [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) to constrain the model output. The model output will be guaranteed to follow the given constraints. Only one constraint parameter (`json_schema`, `regex`, or `ebnf`) can be specified for a request.
SGLang supports two grammar backends:
- [Outlines](https://github.com/dottxt-ai/outlines) (default): Supports JSON schema and regular expression constraints.
- [XGrammar](https://github.com/mlc-ai/xgrammar): Supports JSON schema, regular expression, and EBNF constraints.
- XGrammar currently uses the [GGML BNF format](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md)
Initialize the XGrammar backend using the `--grammar-backend xgrammar` flag
```bash
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
--port 30000 --host 0.0.0.0 --grammar-backend [xgrammar|outlines] # xgrammar or outlines (default: outlines)
```
```python
import json
import requests
json_schema = json.dumps({
"type": "object",
"properties": {
"name": {"type": "string", "pattern": "^[\\w]+$"},
"population": {"type": "integer"},
},
"required": ["name", "population"],
})
# JSON (works with both Outlines and XGrammar)
response = requests.post(
"http://localhost:30000/generate",
json={
"text": "Here is the information of the capital of France in the JSON format.\n",
"sampling_params": {
"temperature": 0,
"max_new_tokens": 64,
"json_schema": json_schema,
},
},
)
print(response.json())
# Regular expression (Outlines backend only)
response = requests.post(
"http://localhost:30000/generate",
json={
"text": "Paris is the capital of",
"sampling_params": {
"temperature": 0,
"max_new_tokens": 64,
"regex": "(France|England)",
},
},
)
print(response.json())
# EBNF (XGrammar backend only)
response = requests.post(
"http://localhost:30000/generate",
json={
"text": "Write a greeting.",
"sampling_params": {
"temperature": 0,
"max_new_tokens": 64,
"ebnf": 'root ::= "Hello" | "Hi" | "Hey"',
},
},
)
print(response.json())
```
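The JSON schema does not have to be written by hand; it can also be generated from a Pydantic model and passed in the same way. A short sketch, assuming pydantic v2 is installed; the hypothetical `CapitalInfo` model mirrors the hand-written schema above:
```python
import json

import requests
from pydantic import BaseModel


class CapitalInfo(BaseModel):
    name: str
    population: int


# Build the JSON schema string from the Pydantic model instead of writing it by hand.
json_schema = json.dumps(CapitalInfo.model_json_schema())

response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "Here is the information of the capital of France in the JSON format.\n",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 64,
            "json_schema": json_schema,
        },
    },
)
print(response.json())
```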
### Custom Logit Processor
Launch a server with `--enable-custom-logit-processor` flag on.
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --enable-custom-logit-processor
```
Define a custom logit processor that will always sample a specific token id.
```python
from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
@@ -89,7 +301,6 @@ class DeterministicLogitProcessor(CustomLogitProcessor):
```
Send a request
```python
import requests
......
@@ -68,7 +68,7 @@ Please consult the documentation below to learn more about the parameters you ma
### API configuration
* `api_key`: Sets an API key for the server and the OpenAI-compatible API.
-* `file_storage_pth`: Directory for storing uploaded or generated files from API calls.
+* `file_storage_path`: Directory for storing uploaded or generated files from API calls.
* `enable_cache_report`: If set, includes detailed usage of cached tokens in the response usage.
## Parallelism
@@ -162,7 +162,6 @@ Please consult the documentation below to learn more about the parameters you ma
*Note: We recommend staying with the defaults and only using these options for debugging, for the best possible performance.*
* `disable_radix_cache`: Disable [Radix](https://lmsys.org/blog/2024-01-17-sglang/) backend for prefix caching.
-* `disable_jump_forward`: Disable [jump-forward](https://lmsys.org/blog/2024-02-05-compressed-fsm/#our-method-jump-forward-decoding-with-a-compressed-finite-state-machine) for outlines grammar backend.
* `disable_cuda_graph`: Disable [cuda graph](https://pytorch.org/blog/accelerating-pytorch-with-cuda-graphs/) for model forward. Use if encountering uncorrectable CUDA ECC errors.
* `disable_cuda_graph_padding`: Disable cuda graph when padding is needed. Otherwise, cuda graph is still used.
* `disable_outlines_disk_cache`: Disable disk cache for outlines grammar backend.
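For example, when the server is launched with `--api-key`, clients of the OpenAI-compatible API must present the same key. A minimal sketch using the `openai` client; the port, key, and model name are assumptions:
```python
from openai import OpenAI

# The API key must match the value passed to the server via `--api-key`.
client = OpenAI(base_url="http://localhost:30000/v1", api_key="your-secret-key")

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Say hello in one word."}],
)
print(response.choices[0].message.content)
```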
......
@@ -47,7 +47,7 @@
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n",
-" --speculative-draft-model-path lmzheng/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
+" --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64\n",
"\"\"\"\n",
")\n",
@@ -104,7 +104,7 @@
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n",
-" --speculative-draft-model-path lmzheng/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
+" --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \\\n",
" --enable-torch-compile --cuda-graph-max-bs 2\n",
"\"\"\"\n",
@@ -175,7 +175,7 @@
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B-Instruct --speculative-algorithm EAGLE \\\n",
-" --speculative-draft-model-path lmzheng/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \\\n",
+" --speculative-draft-model-path lmsys/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --speculative-token-map thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt \\\n",
" --mem-fraction 0.7 --cuda-graph-max-bs 2 --dtype float16 \n",
"\"\"\"\n",
......
@@ -43,4 +43,4 @@ If you want to contribute but don’t have a specific idea in mind, pick issues
If you have any questions or want to start a discussion, please feel free to ask in our [Slack channel](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw).
-Thank you for your interest in SGLang**happy coding**!
+Thank you for your interest in SGLang. Happy coding!
@@ -71,7 +71,7 @@ srun --ntasks=2 --nodes=2 --output="SLURM_Logs/%x_%j_node$SLURM_NODEID.out" \
    --model-path "$model" \
    --grammar-backend "xgrammar" \
    --tp "$tp_size" \
-    --nccl-init-addr "$NCCL_INIT_ADDR" \
+    --dist-init-addr "$NCCL_INIT_ADDR" \
    --nnodes 2 \
    --node-rank "$SLURM_NODEID" &
......
@@ -2,9 +2,10 @@
You can install SGLang using any of the methods below.
-For running DeepSeek V3/R1, refer to [DeepSeek V3 Support](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3). It is recommended to use the [latest version](https://pypi.org/project/sglang/#history) and deploy it with [Docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended) to avoid environment-related problems.
+For running DeepSeek V3/R1, refer to [DeepSeek V3 Support](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3). It is recommended to use the latest version and deploy it with [Docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended) to avoid environment-related issues.
-It is recommended to use uv to install the dependencies for faster installation:
+We recommend using uv to install the dependencies with a higher installation speed:
## Method 1: With pip or uv
```bash
@@ -13,14 +14,13 @@ pip install uv
uv pip install "sglang[all]>=0.4.3.post2" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python
```
-**Quick Fixes to Installation**
+**Quick Fixes to Common Problems**
- SGLang currently uses torch 2.5, so you need to install flashinfer for torch 2.5. If you want to install flashinfer separately, please refer to [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html). Please note that the FlashInfer pypi package is called `flashinfer-python` instead of `flashinfer`.
-- If you encounter `OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root`, please try either of the following solutions:
+- If you encounter `OSError: CUDA_HOME environment variable is not set`. Please set it to your CUDA install root with either of the following solutions:
  1. Use `export CUDA_HOME=/usr/local/cuda-<your-cuda-version>` to set the `CUDA_HOME` environment variable.
  2. Install FlashInfer first following [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html), then install SGLang as described above.
- If you encounter `ImportError; cannot import name 'is_valid_list_of_images' from 'transformers.models.llama.image_processing_llama'`, try to use the specified version of `transformers` in [pyproject.toml](https://github.com/sgl-project/sglang/blob/main/python/pyproject.toml). Currently, just run `pip install transformers==4.48.3`.
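After installation, a quick sanity check (not part of the official docs) is to import the package and print its version:
```python
import sglang

# Confirms the package is importable and shows which version was installed.
print(sglang.__version__)
```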
......
@@ -17,7 +17,7 @@ def main():
    llm = sgl.Engine(
        model_path="meta-llama/Llama-2-7b-chat-hf",
        speculative_algorithm="EAGLE",
-        speculative_draft_model_path="lmzheng/sglang-EAGLE-llama2-chat-7B",
+        speculative_draft_model_path="lmsys/sglang-EAGLE-llama2-chat-7B",
        speculative_num_steps=3,
        speculative_eagle_topk=4,
        speculative_num_draft_tokens=16,
......
@@ -52,7 +52,7 @@ srt = [
# HIP (Heterogeneous-computing Interface for Portability) for AMD
# => base docker rocm/vllm-dev:20241022, not from public vllm whl
-srt_hip = ["sglang[runtime_common]", "sgl-kernel>=0.0.3.post1", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
+srt_hip = ["sglang[runtime_common]", "sgl-kernel==0.0.3.post6", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
# xpu is not enabled in public vllm and torch whl,
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
......
@@ -12,6 +12,5 @@
- `global_config.py`: The global configs and constants.
- `launch_server.py`: The entry point for launching the local server.
- `llama3_eval.py`: Evaluation of Llama 3 using the Meta Llama dataset.
-- `profiler.py`: Profile a running server.
- `utils.py`: Common utilities.
- `version.py`: Version info.
raise ValueError("bench_latency.py has been renamed to bench_one_batch.py")
@@ -4,6 +4,13 @@ import os
class GlobalConfig:
+    """
+    Store some global constants.
+
+    See also python/sglang/srt/managers/schedule_batch.py::global_server_args_dict, which stores
+    many global runtime arguments as well.
+    """
+
    def __init__(self):
        # Verbosity level
        # 0: do not output anything
......
@@ -80,7 +80,6 @@ def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
        grammar_backend = OutlinesGrammarBackend(
            tokenizer,
            whitespace_pattern=server_args.constrained_json_whitespace_pattern,
-            allow_jump_forward=not server_args.disable_jump_forward,
        )
    elif server_args.grammar_backend == "xgrammar":
        from sglang.srt.constrained.xgrammar_backend import XGrammarGrammarBackend
......
@@ -115,7 +115,6 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
        self,
        tokenizer,
        whitespace_pattern: bool,
-        allow_jump_forward: bool,
    ):
        super().__init__()
@@ -140,7 +139,6 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
        self.outlines_tokenizer.vocabulary = (
            self.outlines_tokenizer.tokenizer.get_vocab()
        )
-        self.allow_jump_forward = allow_jump_forward
        self.whitespace_pattern = whitespace_pattern

    def init_value_impl(self, key: Tuple[str, str]) -> OutlinesGrammar:
@@ -172,10 +170,7 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
            logger.warning(f"skip invalid regex schema: {regex=}, {e=}")
            return None

-        if self.allow_jump_forward:
-            jump_forward_map = OutlinesJumpForwardMap(regex)
-        else:
-            jump_forward_map = None
+        jump_forward_map = None
        return OutlinesGrammar(guide, jump_forward_map)
......
@@ -438,8 +438,8 @@ async def configure_logging(obj: ConfigureLoggingReq, request: Request):
    return Response(status_code=200)

-@app.post("/function_call")
-async def function_call_request(obj: ParseFunctionCallReq, request: Request):
+@app.post("/parse_function_call")
+async def parse_function_call_request(obj: ParseFunctionCallReq, request: Request):
    """
    A native API endpoint to parse function calls from a text.
    """
@@ -492,7 +492,7 @@ def available_models():
@app.post("/v1/files")
async def openai_v1_files(file: UploadFile = File(...), purpose: str = Form("batch")):
    return await v1_files_create(
-        file, purpose, _global_state.tokenizer_manager.server_args.file_storage_pth
+        file, purpose, _global_state.tokenizer_manager.server_args.file_storage_path
    )
......
@@ -4,7 +4,7 @@ from typing import TYPE_CHECKING
import torch

-from sglang.srt.layers.attention import AttentionBackend
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.managers.schedule_batch import global_server_args_dict
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
......
@@ -19,9 +19,8 @@ import triton
import triton.language as tl

from sglang.global_config import global_config
-from sglang.srt.layers.attention import AttentionBackend
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.layers.dp_attention import get_attention_tp_size
-from sglang.srt.managers.schedule_batch import global_server_args_dict
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
from sglang.srt.utils import is_flashinfer_available
......
@@ -15,7 +15,7 @@ from typing import TYPE_CHECKING, Optional, Union
import torch

from sglang.global_config import global_config
-from sglang.srt.layers.attention import AttentionBackend
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.layers.attention.flashinfer_backend import (
    create_flashinfer_kv_indices_triton,
)
@@ -34,7 +34,6 @@ if is_flashinfer_available():
        BatchMLAPagedAttentionWrapper,
        BatchPrefillWithRaggedKVCacheWrapper,
    )
-    from flashinfer.cascade import merge_state

@dataclass
......
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
import torch
from torch.nn.functional import scaled_dot_product_attention

-from sglang.srt.layers.attention import AttentionBackend
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
from sglang.srt.model_executor.forward_batch_info import ForwardBatch

if TYPE_CHECKING:
......