This doc describes the sampling parameters of the SGLang Runtime. It is the low-level endpoint of the runtime. If you want a high-level endpoint that can automatically handle chat templates, consider using the [OpenAI Compatible API](https://docs.sglang.ai/backend/openai_api_completions.html).
## `/generate` Endpoint
The `/generate` endpoint accepts the following parameters in JSON format. For detailed usage, see the [native API doc](https://docs.sglang.ai/backend/native_api.html). You can find code examples below.
* `text`: The input prompt. It can be a single prompt or a batch of prompts. `Optional[Union[List[str], str]] = None`
* `input_ids`: The token ids of the prompt; one can specify either `text` or `input_ids`, but not both. `Optional[Union[List[List[int]], List[int]]] = None`
* `sampling_params`: The sampling parameters as described in the sections below.
* `return_logprob`: Whether to return log probabilities for tokens (off by default).
* `logprob_start_len`: If returning log probabilities, specifies the start position in the prompt. Default is `-1`, which returns logprobs only for output tokens. `Optional[Union[List[int], int]] = None`
* `top_logprobs_num`: If returning log probabilities, specifies the number of top logprobs to return at each position. `Optional[Union[List[int], int]] = None`
* `stream`: Whether to stream the output. `bool = False`
* `lora_path`: Path to LoRA weights. `Optional[Union[List[Optional[str]], Optional[str]]] = None`
* `custom_logit_processor`: Custom logit processor for advanced sampling control. Must be a serialized instance of `CustomLogitProcessor` in `python/sglang/srt/sampling/custom_logit_processor.py`, produced with the processor's `to_str()` method. For usage see below. `Optional[Union[List[Optional[str]], str]] = None`
* `return_hidden_states`: Whether to return hidden states of the model. Note that each time it changes, the cuda graph will be recaptured, which might lead to a performance hit. See the [examples](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/hidden_states.py) for more information. `bool = False`
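For example, a minimal request can be sent with Python's `requests` package. This is a sketch that assumes a server is already running at the default `http://localhost:30000`; the prompt and parameter values are placeholders.

```python
import requests

# Minimal /generate request: a single prompt plus a few request-level options.
# Assumes an SGLang server is running at http://localhost:30000.
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {"temperature": 0, "max_new_tokens": 32},
        "return_logprob": True,   # also return log probabilities
        "logprob_start_len": -1,  # only for output tokens (default)
    },
)
print(response.json())
```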
## Sampling params

The `sampling_params` field accepts the options below.

### Core parameters

* `max_new_tokens`: The maximum output length measured in tokens. `int = 128`
* `stop`: One or multiple [stop words](https://platform.openai.com/docs/api-reference/chat/create#chat-create-stop). Generation will stop if one of these words is sampled. `Optional[Union[str, List[str]]] = None`
* `stop_token_ids`: Provide stop words in the form of token ids. Generation will stop if one of these token ids is sampled. `Optional[List[int]] = []`
* `temperature`: [Temperature](https://platform.openai.com/docs/api-reference/chat/create#chat-create-temperature) when sampling the next token. `temperature = 0` corresponds to greedy sampling; a higher temperature leads to more diversity. `float = 1.0`
* `top_p`: [Top-p](https://platform.openai.com/docs/api-reference/chat/create#chat-create-top_p) selects tokens from the smallest sorted set whose cumulative probability exceeds `top_p`. When `top_p = 1`, this reduces to unrestricted sampling from all tokens. `float = 1.0`
* `top_k`: [Top-k](https://developer.nvidia.com/blog/how-to-get-better-outputs-from-your-large-language-model/#predictability_vs_creativity) randomly selects from the `k` highest-probability tokens. `int = -1`
* `min_p`: [Min-p](https://github.com/huggingface/transformers/issues/27670) samples from tokens with probability larger than `min_p * highest_token_probability`. `float = 0.0`
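To build intuition for how `top_k`, `top_p`, and `min_p` prune the candidate set, here is a small illustrative sketch on a hand-written toy distribution. It follows the descriptions above and is not SGLang's actual sampling implementation; the filter order and tie-breaking in the runtime may differ.

```python
# Illustrative only: prune a toy next-token distribution with top-k, top-p, and min-p.
probs = {"the": 0.40, "a": 0.25, "Paris": 0.20, "banana": 0.10, "qux": 0.05}

def filter_candidates(probs, top_k=-1, top_p=1.0, min_p=0.0):
    items = sorted(probs.items(), key=lambda kv: kv[1], reverse=True)
    if top_k > 0:
        items = items[:top_k]                      # keep the k most likely tokens
    if min_p > 0.0:
        threshold = min_p * items[0][1]            # relative to the highest probability
        items = [kv for kv in items if kv[1] >= threshold]
    kept, cumulative = [], 0.0
    for token, p in items:                         # smallest set whose cumulative prob reaches top_p
        kept.append((token, p))
        cumulative += p
        if cumulative >= top_p:
            break
    total = sum(p for _, p in kept)
    return {token: p / total for token, p in kept}  # renormalize before sampling

print(filter_candidates(probs, top_k=4, top_p=0.9, min_p=0.1))
```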
### Penalizers
To use penalizers, you need to launch the server with `--disable-overlap`. Please note that this might degrade performance.
* `frequency_penalty`: Penalizes tokens based on their frequency in the generation so far. Must be between `-2` and `2`, where negative values encourage repetition and positive values encourage sampling new tokens. The penalization grows linearly with each appearance of a token. `float = 0.0`
* `presence_penalty`: Penalizes tokens that have already appeared in the generation so far. Must be between `-2` and `2`, where negative values encourage repetition and positive values encourage sampling new tokens. The penalization is a constant offset once a token has occurred. `float = 0.0`
* `repetition_penalty`: Penalizes tokens that appeared in the prompt or the generation so far. Must be between `0` and `2`, where values smaller than `1` encourage repetition and values larger than `1` encourage sampling new tokens. The penalization scales multiplicatively. `float = 1.0`
* `min_new_tokens`: Forces the model to generate at least `min_new_tokens` tokens before a stop word or the EOS token can be sampled. Must satisfy `0 <= min_new_tokens < max_new_tokens`. Only the EOS token and `stop_token_ids` are suppressed, so a `stop` string may still be generated earlier. Note that this might lead to unintended behavior, for example, if the distribution is highly skewed towards these tokens. `int = 0`
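As a rough illustration of how the two additive penalties differ, the sketch below applies the common OpenAI-style formulation to a toy logit table: the frequency penalty scales with how often a token has appeared, while the presence penalty is a fixed offset. This is illustrative only; SGLang's penalizer kernels may differ in detail.

```python
from collections import Counter

# Illustrative OpenAI-style additive penalties; not SGLang's exact kernels.
def apply_additive_penalties(logits, generated_token_ids,
                             frequency_penalty=0.0, presence_penalty=0.0):
    counts = Counter(generated_token_ids)
    penalized = dict(logits)
    for token_id, count in counts.items():
        if token_id in penalized:
            penalized[token_id] -= frequency_penalty * count  # grows linearly with each appearance
            penalized[token_id] -= presence_penalty           # constant offset once a token appeared
    return penalized

logits = {0: 2.0, 1: 1.5, 2: 0.5}
print(apply_additive_penalties(logits, [0, 0, 1], frequency_penalty=0.5, presence_penalty=0.2))
# token 0 (seen twice): 2.0 - 0.5*2 - 0.2 = 0.8; token 1 (seen once): 1.5 - 0.5 - 0.2 = 0.8
```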
### Constrained decoding

Please refer to our dedicated guide on [constrained decoding](https://docs.sglang.ai/backend/structured_outputs.html#Native-API-and-SGLang-Runtime-(SRT)) for the following parameters. Only one of the three can be set for a request.
* `json_schema`: Constrain the output to follow a given JSON schema. `Optional[str] = None`
* `regex`: Constrain the output to follow a given regular expression. `Optional[str] = None`
* `ebnf`: Constrain the output to follow a given EBNF grammar. `Optional[str] = None`
### Other options
* `n`: Specifies the number of output sequences to generate per request. Generating multiple outputs in one request (`n > 1`) is discouraged; repeating the same prompt several times offers better control and efficiency. `int = 1`
* `spaces_between_special_tokens`: Whether or not to add spaces between special tokens during detokenization. `bool = True`
* `no_stop_trim`: Don't trim stop words or the EOS token from the generated text. `bool = False`
* `ignore_eos`: Don't stop generation when the EOS token is sampled. `bool = False`
* `skip_special_tokens`: Remove special tokens during decoding. `bool = True`
* `custom_params`: Custom parameters passed to the `CustomLogitProcessor`; when batching, provide one dictionary per request. See also `python/sglang/srt/sampling/custom_logit_processor.py`. For usage see below. `Optional[List[Optional[Dict[str, Any]]]] = None`
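A rough sketch of the `custom_logit_processor` / `custom_params` flow is shown below. The exact interface lives in `python/sglang/srt/sampling/custom_logit_processor.py`; the processor class, its `__call__` signature, and the `--enable-custom-logit-processor` launch flag used here are assumptions that may need adjusting to your SGLang version.

```python
# Sketch only: the processor subclass and the request wiring are illustrative.
# Assumes the server was launched with custom logit processors enabled
# (e.g. `--enable-custom-logit-processor`, if available in your version).
import requests
from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor

class ForceTokenLogitProcessor(CustomLogitProcessor):
    """Illustrative processor: force sampling of the token id given in custom_params."""
    def __call__(self, logits, custom_param_list):
        # One dict of custom params per request in the batch (assumed interface).
        for i, params in enumerate(custom_param_list):
            logits[i, :] = -float("inf")
            logits[i, params["token_id"]] = 0.0
        return logits

response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "The capital of France is",
        "custom_logit_processor": ForceTokenLogitProcessor().to_str(),  # serialized processor
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 8,
            "custom_params": {"token_id": 5},  # consumed by the processor above
        },
    },
)
print(response.json())
```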
"text":"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n<image>\nDescribe this image in a very short sentence.<|im_end|>\n"
"<|im_start|>assistant\n",
"image_data":"example_image.png",
"sampling_params":{
"temperature":0,
"max_new_tokens":32,
},
},
)
print(response.json())
```
The `image_data` can be a file name, a URL, or a base64 encoded string. See also `python/sglang/srt/utils.py:load_image`.
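For instance, a base64-encoded payload can be produced like this (illustrative snippet):

```python
import base64

# Encode a local image file as base64 and pass the result as "image_data".
with open("example_image.png", "rb") as f:
    encoded_image = base64.b64encode(f.read()).decode("utf-8")
```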
Streaming is supported in a similar manner; a sketch of consuming a streamed response is shown below.
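This is a minimal sketch that assumes the server emits server-sent-event style `data: ...` lines terminated by `data: [DONE]`, with each chunk carrying the cumulative text so far; check the native API doc for the exact wire format.

```python
import json
import requests

# Sketch: stream tokens from /generate.
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {"temperature": 0, "max_new_tokens": 32},
        "stream": True,
    },
    stream=True,
)

printed = 0
for line in response.iter_lines(decode_unicode=True):
    if not line or not line.startswith("data:"):
        continue
    if line == "data: [DONE]":
        break
    data = json.loads(line[len("data:"):].strip())
    text = data["text"]
    print(text[printed:], end="", flush=True)  # print only the newly generated part
    printed = len(text)
print()
```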
### Structured Outputs (JSON, Regex, EBNF)
You can specify a JSON schema, regular expression or [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) to constrain the model output. The model output will be guaranteed to follow the given constraints. Only one constraint parameter (`json_schema`, `regex`, or `ebnf`) can be specified for a request.
SGLang supports two grammar backends:
- [Outlines](https://github.com/dottxt-ai/outlines) (default): Supports JSON schema and regular expression constraints.
- [XGrammar](https://github.com/mlc-ai/xgrammar): Supports JSON schema, regular expression, and EBNF constraints.
  - XGrammar currently uses the [GGML BNF format](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md).

Initialize the XGrammar backend using the `--grammar-backend xgrammar` flag.
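For example, a JSON-schema-constrained request can look like the sketch below; the schema and prompt are placeholders, and the [constrained decoding guide](https://docs.sglang.ai/backend/structured_outputs.html) has complete examples.

```python
import json
import requests

# Constrain the output to a JSON object with "name" and "population" fields.
# The schema and prompt here are illustrative placeholders.
json_schema = json.dumps({
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "population": {"type": "integer"},
    },
    "required": ["name", "population"],
})

response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "Here is the information of the capital of France in the JSON format.\n",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 64,
            "json_schema": json_schema,  # only one of json_schema / regex / ebnf may be set
        },
    },
)
print(response.json())
```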