Unverified Commit 75ee0011 authored by Huapeng Zhou, committed by GitHub

[Doc] Fix SGLang tool parser doc (#9886)

parent ec15c836
......@@ -80,6 +80,7 @@
" --enable-lora \\\n",
" --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n",
" --max-loras-per-batch 1 --lora-backend triton \\\n",
" --log-level warning \\\n",
"\"\"\"\n",
")\n",
"\n",
......@@ -139,6 +140,7 @@
" --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n",
" lora1=Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16 \\\n",
" --max-loras-per-batch 2 --lora-backend triton \\\n",
" --log-level warning \\\n",
"\"\"\"\n",
")\n",
"\n",
......@@ -215,6 +217,7 @@
" --max-loras-per-batch 2 --lora-backend triton \\\n",
" --max-lora-rank 256\n",
" --lora-target-modules all\n",
" --log-level warning\n",
" \"\"\"\n",
")\n",
"\n",
......@@ -417,6 +420,7 @@
" {\"lora_name\":\"lora0\",\"lora_path\":\"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16\",\"pinned\":true} \\\n",
" {\"lora_name\":\"lora1\",\"lora_path\":\"algoprog/fact-generation-llama-3.1-8b-instruct-lora\"} \\\n",
" lora2=philschmid/code-llama-3-1-8b-text-to-sql-lora\n",
" --log-level warning\n",
" \"\"\"\n",
")\n",
"\n",
......
......@@ -67,7 +67,7 @@
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1\"\n",
" \"python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")"
......
......@@ -70,7 +70,7 @@
" \"\"\"\n",
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n",
" --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 3 \\\n",
" --speculative-eagle-topk 4 --speculative-num-draft-tokens 16 --cuda-graph-max-bs 8\n",
" --speculative-eagle-topk 4 --speculative-num-draft-tokens 16 --cuda-graph-max-bs 8 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
......@@ -126,7 +126,7 @@
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n",
" --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \\\n",
" --enable-torch-compile --torch-compile-max-bs 2\n",
" --enable-torch-compile --torch-compile-max-bs 2 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
......@@ -186,7 +186,7 @@
"python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B-Instruct --speculative-algorithm EAGLE \\\n",
" --speculative-draft-model-path lmsys/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --speculative-token-map thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt \\\n",
" --mem-fraction 0.7 --cuda-graph-max-bs 2 --dtype float16 \n",
" --mem-fraction 0.7 --cuda-graph-max-bs 2 --dtype float16 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
......@@ -242,7 +242,7 @@
"python3 -m sglang.launch_server --model meta-llama/Llama-3.1-8B-Instruct --speculative-algorithm EAGLE3 \\\n",
" --speculative-draft-model-path jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 32 --mem-fraction 0.6 \\\n",
" --cuda-graph-max-bs 2 --dtype float16\n",
" --cuda-graph-max-bs 2 --dtype float16 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
......@@ -297,7 +297,7 @@
" \"\"\"\n",
" python3 -m sglang.launch_server --model-path XiaomiMiMo/MiMo-7B-RL --host 0.0.0.0 --trust-remote-code \\\n",
" --speculative-algorithm EAGLE --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 \\\n",
" --mem-fraction 0.5\n",
" --mem-fraction 0.5 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
......
......@@ -51,7 +51,7 @@
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0\"\n",
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
......
......@@ -47,7 +47,7 @@
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1\"\n",
" \"python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
......
......@@ -4,11 +4,29 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tool and Function Calling\n",
"# Tool Parser\n",
"\n",
"This guide demonstrates how to use SGLang’s [Function calling](https://platform.openai.com/docs/guides/function-calling) functionality."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Currently supported parsers:\n",
"\n",
"| Parser | Supported Models | Notes |\n",
"|---|---|---|\n",
"| `llama3` | Llama 3.1 / 3.2 / 3.3 (e.g. `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct`, `meta-llama/Llama-3.3-70B-Instruct`) | |\n",
"| `llama4` | Llama 4 (e.g. `meta-llama/Llama-4-Scout-17B-16E-Instruct`) | |\n",
"| `mistral` | Mistral (e.g. `mistralai/Mistral-7B-Instruct-v0.3`, `mistralai/Mistral-Nemo-Instruct-2407`, `mistralai/Mistral-7B-v0.3`) | |\n",
"| `qwen25` | Qwen 2.5 (e.g. `Qwen/Qwen2.5-1.5B-Instruct`, `Qwen/Qwen2.5-7B-Instruct`) and QwQ (i.e. `Qwen/QwQ-32B`) | For QwQ, reasoning parser can be enabled together with tool call parser. See [reasoning parser](https://docs.sglang.ai/backend/separate_reasoning.html). |\n",
"| `deepseekv3` | DeepSeek-v3 (e.g., `deepseek-ai/DeepSeek-V3-0324`) | |\n",
"| `gpt-oss` | GPT-OSS (e.g., `openai/gpt-oss-120b`, `openai/gpt-oss-20b`, `lmsys/gpt-oss-120b-bf16`, `lmsys/gpt-oss-20b-bf16`) | The gpt-oss tool parser filters out analysis channel events and only preserves normal text. This can cause the content to be empty when explanations are in the analysis channel. To work around this, complete the tool round by returning tool results as `role=\"tool\"` messages, which enables the model to generate the final content. |\n",
"| `kimi_k2` | `moonshotai/Kimi-K2-Instruct` | |\n",
"| `pythonic` | Llama-3.2 / Llama-3.3 / Llama-4 | Model outputs function calls as Python code. Requires `--tool-call-parser pythonic` and is recommended to use with a specific chat template. |\n"
]
},
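For the gpt-oss workaround above, a minimal sketch of a complete tool round may help. It is illustrative only: the `get_current_weather` tool, its mock result, the port `30000`, and the model id are assumptions, and the server is assumed to have been launched with `--tool-call-parser gpt-oss`.

```python
# Hypothetical sketch: complete a tool round with a role="tool" message so the
# gpt-oss model can produce non-empty final content on the next turn.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="None")  # adjust port as needed

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",  # illustrative tool, not from the notebook
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

messages = [{"role": "user", "content": "What is the weather in Paris today?"}]
first = client.chat.completions.create(
    model="openai/gpt-oss-20b", messages=messages, tools=tools
)
tool_call = first.choices[0].message.tool_calls[0]

# Echo the assistant turn containing the tool call, then return the (mock)
# tool result as a role="tool" message keyed by tool_call_id.
messages.append(first.choices[0].message)
messages.append(
    {
        "role": "tool",
        "tool_call_id": tool_call.id,
        "content": '{"temperature_c": 18, "condition": "cloudy"}',
    }
)

final = client.chat.completions.create(
    model="openai/gpt-oss-20b", messages=messages, tools=tools
)
print(final.choices[0].message.content)  # final turn now carries normal text
```

With the tool result supplied in a `role="tool"` message, the model's final turn carries regular content instead of an empty string.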
{
"cell_type": "markdown",
"metadata": {},
......@@ -35,7 +53,7 @@
"from openai import OpenAI\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0\" # qwen25\n",
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\" # qwen25\n",
")\n",
"wait_for_server(f\"http://localhost:{port}\")"
]
......@@ -44,16 +62,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that `--tool-call-parser` defines the parser used to interpret responses. Currently supported parsers include:\n",
"\n",
"- llama3: Llama 3.1 / 3.2 / 3.3 (e.g. meta-llama/Llama-3.1-8B-Instruct, meta-llama/Llama-3.2-1B-Instruct, meta-llama/Llama-3.3-70B-Instruct).\n",
"- llama4: Llama 4 (e.g. meta-llama/Llama-4-Scout-17B-16E-Instruct).\n",
"- mistral: Mistral (e.g. mistralai/Mistral-7B-Instruct-v0.3, mistralai/Mistral-Nemo-Instruct-2407, mistralai/\n",
"Mistral-Nemo-Instruct-2407, mistralai/Mistral-7B-v0.3).\n",
"- qwen25: Qwen 2.5 (e.g. Qwen/Qwen2.5-1.5B-Instruct, Qwen/Qwen2.5-7B-Instruct) and QwQ (i.e. Qwen/QwQ-32B). Especially, for QwQ, we can enable the reasoning parser together with tool call parser, details about reasoning parser can be found in [reasoning parser](https://docs.sglang.ai/backend/separate_reasoning.html).\n",
"- deepseekv3: DeepSeek-v3 (e.g., deepseek-ai/DeepSeek-V3-0324).\n",
"- gpt-oss: GPT-OSS (e.g., openai/gpt-oss-120b, openai/gpt-oss-20b, lmsys/gpt-oss-120b-bf16, lmsys/gpt-oss-20b-bf16). Note: The gpt-oss tool parser filters out analysis channel events and only preserves normal text. This can cause the content to be empty when explanations are in the analysis channel. To work around this, complete the tool round by returning tool results as role=\"tool\" messages, which enables the model to generate the final content.\n",
"- kimi_k2: moonshotai/Kimi-K2-Instruct"
"Note that `--tool-call-parser` defines the parser used to interpret responses."
]
},
{
......@@ -169,11 +178,11 @@
" tools=tools,\n",
")\n",
"print_highlight(\"Non-stream response:\")\n",
"print(response_non_stream)\n",
"print_highlight(response_non_stream)\n",
"print_highlight(\"==== content ====\")\n",
"print(response_non_stream.choices[0].message.content)\n",
"print_highlight(response_non_stream.choices[0].message.content)\n",
"print_highlight(\"==== tool_calls ====\")\n",
"print(response_non_stream.choices[0].message.tool_calls)"
"print_highlight(response_non_stream.choices[0].message.tool_calls)"
]
},
{
......@@ -234,11 +243,11 @@
" if chunk.choices[0].delta.tool_calls:\n",
" tool_calls.append(chunk.choices[0].delta.tool_calls[0])\n",
"print_highlight(\"==== Text ====\")\n",
"print(texts)\n",
"print_highlight(texts)\n",
"\n",
"print_highlight(\"==== Tool Call ====\")\n",
"for tool_call in tool_calls:\n",
" print(tool_call)"
" print_highlight(tool_call)"
]
},
{
......@@ -350,10 +359,10 @@
" tools=tools,\n",
")\n",
"print_highlight(\"Non-stream response:\")\n",
"print(final_response)\n",
"print_highlight(final_response)\n",
"\n",
"print_highlight(\"==== Text ====\")\n",
"print(final_response.choices[0].message.content)"
"print_highlight(final_response.choices[0].message.content)"
]
},
{
......@@ -396,7 +405,7 @@
"}\n",
"gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
"print_highlight(\"==== Response ====\")\n",
"print(gen_response)\n",
"print_highlight(gen_response)\n",
"\n",
"# parse the response\n",
"parse_url = f\"http://localhost:{port}/parse_function_call\"\n",
......@@ -463,8 +472,8 @@
"result = llm.generate(input_ids=input_ids, sampling_params=sampling_params)\n",
"generated_text = result[\"text\"] # Assume there is only one prompt\n",
"\n",
"print(\"=== Offline Engine Output Text ===\")\n",
"print(generated_text)\n",
"print_highlight(\"=== Offline Engine Output Text ===\")\n",
"print_highlight(generated_text)\n",
"\n",
"\n",
"# 2) Parse using FunctionCallParser\n",
......@@ -485,13 +494,13 @@
"parser = FunctionCallParser(tools=tools, tool_call_parser=\"qwen25\")\n",
"normal_text, calls = parser.parse_non_stream(generated_text)\n",
"\n",
"print(\"=== Parsing Result ===\")\n",
"print_highlight(\"=== Parsing Result ===\")\n",
"print(\"Normal text portion:\", normal_text)\n",
"print(\"Function call portion:\")\n",
"print_highlight(\"Function call portion:\")\n",
"for call in calls:\n",
" # call: ToolCallItem\n",
" print(f\" - tool name: {call.name}\")\n",
" print(f\" parameters: {call.parameters}\")\n",
" print_highlight(f\" - tool name: {call.name}\")\n",
" print_highlight(f\" parameters: {call.parameters}\")\n",
"\n",
"# 3) If needed, perform additional logic on the parsed functions, such as automatically calling the corresponding function to obtain a return value, etc."
]
......@@ -537,7 +546,7 @@
"\n",
"# Start a new server session for tool choice examples\n",
"server_process_tool_choice, port_tool_choice = launch_server_cmd(\n",
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0\"\n",
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\"\n",
")\n",
"wait_for_server(f\"http://localhost:{port_tool_choice}\")\n",
"\n",
......@@ -628,8 +637,8 @@
"\n",
"if response_specific.choices[0].message.tool_calls:\n",
" tool_call = response_specific.choices[0].message.tool_calls[0]\n",
" print(f\"Called function: {tool_call.function.name}\")\n",
" print(f\"Arguments: {tool_call.function.arguments}\")"
" print_highlight(f\"Called function: {tool_call.function.name}\")\n",
" print_highlight(f\"Arguments: {tool_call.function.arguments}\")"
]
},
{
......@@ -682,7 +691,7 @@
"import openai\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \" python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --tool-call-parser pythonic --tp 1\" # llama-3.2-1b-instruct\n",
" \" python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --tool-call-parser pythonic --tp 1 --log-level warning\" # llama-3.2-1b-instruct\n",
")\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
"\n",
......@@ -762,7 +771,7 @@
" tools=tools,\n",
")\n",
"print_highlight(\"Non-stream response:\")\n",
"print(response_non_stream)\n",
"print_highlight(response_non_stream)\n",
"\n",
"response_stream = client.chat.completions.create(\n",
" model=model_name,\n",
......@@ -785,11 +794,11 @@
"\n",
"print_highlight(\"Streaming Response:\")\n",
"print_highlight(\"==== Text ====\")\n",
"print(texts)\n",
"print_highlight(texts)\n",
"\n",
"print_highlight(\"==== Tool Call ====\")\n",
"for tool_call in tool_calls:\n",
" print(tool_call)\n",
" print_highlight(tool_call)\n",
"\n",
"terminate_process(server_process)"
]
......
......@@ -43,7 +43,7 @@
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\"\n",
" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")"
......@@ -267,7 +267,7 @@
"embedding_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
" --host 0.0.0.0 --is-embedding\n",
" --host 0.0.0.0 --is-embedding --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
......@@ -316,7 +316,7 @@
"reranker_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path BAAI/bge-reranker-v2-m3 \\\n",
" --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding\n",
" --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
......@@ -376,7 +376,7 @@
"\n",
"reward_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding\n",
"python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
......@@ -441,7 +441,7 @@
"outputs": [],
"source": [
"expert_record_server_process, port = launch_server_cmd(\n",
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0 --expert-distribution-recorder-mode stat\"\n",
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0 --expert-distribution-recorder-mode stat --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")"
......
......@@ -36,7 +36,7 @@
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\"\n",
" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
......
......@@ -33,7 +33,7 @@
"embedding_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
" --host 0.0.0.0 --is-embedding\n",
" --host 0.0.0.0 --is-embedding --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
......
......@@ -35,7 +35,7 @@
"\n",
"vision_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct\n",
"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
......
......@@ -34,7 +34,7 @@
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \\\n",
" --host 0.0.0.0\n",
" --host 0.0.0.0 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
......
......@@ -38,7 +38,7 @@ The core features include:
advanced_features/speculative_decoding.ipynb
advanced_features/structured_outputs.ipynb
advanced_features/structured_outputs_for_reasoning_models.ipynb
advanced_features/function_calling.ipynb
advanced_features/tool_parser.ipynb
advanced_features/separate_reasoning.ipynb
advanced_features/quantization.md
advanced_features/lora.ipynb
......
......@@ -39,7 +39,7 @@
"from sglang.utils import print_highlight, terminate_process, wait_for_server\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0\"\n",
" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
......@@ -395,7 +395,7 @@
"outputs": [],
"source": [
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --host 0.0.0.0\"\n",
" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
......
......@@ -457,6 +457,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
To reduce the log length, we set the server log level to warning (the default is info).
We are running those notebooks in a CI environment, so the throughput is not representative of the actual performance.
"""
)
......