Unverified commit 75ee0011 authored by Huapeng Zhou, committed by GitHub

[Doc] Fix SGLang tool parser doc (#9886)

parent ec15c836
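The common edit across the notebooks below is appending `--log-level warning` to each server launch command so the rendered docs stay readable. A minimal sketch of that launch pattern, with the flag in place (illustrative only, not part of the commit; the `launch_server_cmd` import path and model path are assumptions):

```python
# Illustrative sketch: launch an SGLang server with quieter logging,
# as the docs below now do. Not part of the commit itself.
from sglang.utils import launch_server_cmd, wait_for_server, terminate_process  # import path assumed

server_process, port = launch_server_cmd(
    "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct "
    "--host 0.0.0.0 --log-level warning"  # default log level is info
)
wait_for_server(f"http://localhost:{port}")

# ... run requests against http://localhost:{port} ...

terminate_process(server_process)
```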
@@ -80,6 +80,7 @@
" --enable-lora \\\n",
" --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n",
" --max-loras-per-batch 1 --lora-backend triton \\\n",
+" --log-level warning \\\n",
"\"\"\"\n",
")\n",
"\n",
@@ -139,6 +140,7 @@
" --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n",
" lora1=Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16 \\\n",
" --max-loras-per-batch 2 --lora-backend triton \\\n",
+" --log-level warning \\\n",
"\"\"\"\n",
")\n",
"\n",
@@ -215,6 +217,7 @@
" --max-loras-per-batch 2 --lora-backend triton \\\n",
" --max-lora-rank 256\n",
" --lora-target-modules all\n",
+" --log-level warning\n",
" \"\"\"\n",
")\n",
"\n",
@@ -417,6 +420,7 @@
" {\"lora_name\":\"lora0\",\"lora_path\":\"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16\",\"pinned\":true} \\\n",
" {\"lora_name\":\"lora1\",\"lora_path\":\"algoprog/fact-generation-llama-3.1-8b-instruct-lora\"} \\\n",
" lora2=philschmid/code-llama-3-1-8b-text-to-sql-lora\n",
+" --log-level warning\n",
" \"\"\"\n",
")\n",
"\n",
...
@@ -67,7 +67,7 @@
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"server_process, port = launch_server_cmd(\n",
-" \"python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1\"\n",
+" \"python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")"
...
@@ -70,7 +70,7 @@
" \"\"\"\n",
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n",
" --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 3 \\\n",
-" --speculative-eagle-topk 4 --speculative-num-draft-tokens 16 --cuda-graph-max-bs 8\n",
+" --speculative-eagle-topk 4 --speculative-num-draft-tokens 16 --cuda-graph-max-bs 8 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
@@ -126,7 +126,7 @@
"python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf --speculative-algorithm EAGLE \\\n",
" --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \\\n",
-" --enable-torch-compile --torch-compile-max-bs 2\n",
+" --enable-torch-compile --torch-compile-max-bs 2 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
@@ -186,7 +186,7 @@
"python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B-Instruct --speculative-algorithm EAGLE \\\n",
" --speculative-draft-model-path lmsys/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --speculative-token-map thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt \\\n",
-" --mem-fraction 0.7 --cuda-graph-max-bs 2 --dtype float16 \n",
+" --mem-fraction 0.7 --cuda-graph-max-bs 2 --dtype float16 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
@@ -242,7 +242,7 @@
"python3 -m sglang.launch_server --model meta-llama/Llama-3.1-8B-Instruct --speculative-algorithm EAGLE3 \\\n",
" --speculative-draft-model-path jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B --speculative-num-steps 5 \\\n",
" --speculative-eagle-topk 8 --speculative-num-draft-tokens 32 --mem-fraction 0.6 \\\n",
-" --cuda-graph-max-bs 2 --dtype float16\n",
+" --cuda-graph-max-bs 2 --dtype float16 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
@@ -297,7 +297,7 @@
" \"\"\"\n",
" python3 -m sglang.launch_server --model-path XiaomiMiMo/MiMo-7B-RL --host 0.0.0.0 --trust-remote-code \\\n",
" --speculative-algorithm EAGLE --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 \\\n",
-" --mem-fraction 0.5\n",
+" --mem-fraction 0.5 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
...
@@ -51,7 +51,7 @@
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
-" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0\"\n",
+" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
...
@@ -47,7 +47,7 @@
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
-" \"python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1\"\n",
+" \"python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
...
@@ -4,11 +4,29 @@
"cell_type": "markdown",
"metadata": {},
"source": [
-"# Tool and Function Calling\n",
+"# Tool Parser\n",
"\n",
"This guide demonstrates how to use SGLang’s [Function calling](https://platform.openai.com/docs/guides/function-calling) functionality."
]
},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Currently supported parsers:\n",
+"\n",
+"| Parser | Supported Models | Notes |\n",
+"|---|---|---|\n",
+"| `llama3` | Llama 3.1 / 3.2 / 3.3 (e.g. `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct`, `meta-llama/Llama-3.3-70B-Instruct`) | |\n",
+"| `llama4` | Llama 4 (e.g. `meta-llama/Llama-4-Scout-17B-16E-Instruct`) | |\n",
+"| `mistral` | Mistral (e.g. `mistralai/Mistral-7B-Instruct-v0.3`, `mistralai/Mistral-Nemo-Instruct-2407`, `mistralai/Mistral-7B-v0.3`) | |\n",
+"| `qwen25` | Qwen 2.5 (e.g. `Qwen/Qwen2.5-1.5B-Instruct`, `Qwen/Qwen2.5-7B-Instruct`) and QwQ (i.e. `Qwen/QwQ-32B`) | For QwQ, reasoning parser can be enabled together with tool call parser. See [reasoning parser](https://docs.sglang.ai/backend/separate_reasoning.html). |\n",
+"| `deepseekv3` | DeepSeek-v3 (e.g., `deepseek-ai/DeepSeek-V3-0324`) | |\n",
+"| `gpt-oss` | GPT-OSS (e.g., `openai/gpt-oss-120b`, `openai/gpt-oss-20b`, `lmsys/gpt-oss-120b-bf16`, `lmsys/gpt-oss-20b-bf16`) | The gpt-oss tool parser filters out analysis channel events and only preserves normal text. This can cause the content to be empty when explanations are in the analysis channel. To work around this, complete the tool round by returning tool results as `role=\"tool\"` messages, which enables the model to generate the final content. |\n",
+"| `kimi_k2` | `moonshotai/Kimi-K2-Instruct` | |\n",
+"| `pythonic` | Llama-3.2 / Llama-3.3 / Llama-4 | Model outputs function calls as Python code. Requires `--tool-call-parser pythonic` and is recommended to use with a specific chat template. |\n"
+]
+},
{
"cell_type": "markdown",
"metadata": {},
@@ -35,7 +53,7 @@
"from openai import OpenAI\n",
"\n",
"server_process, port = launch_server_cmd(\n",
-" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0\" # qwen25\n",
+" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\" # qwen25\n",
")\n",
"wait_for_server(f\"http://localhost:{port}\")"
]
@@ -44,16 +62,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
-"Note that `--tool-call-parser` defines the parser used to interpret responses. Currently supported parsers include:\n",
-"\n",
-"- llama3: Llama 3.1 / 3.2 / 3.3 (e.g. meta-llama/Llama-3.1-8B-Instruct, meta-llama/Llama-3.2-1B-Instruct, meta-llama/Llama-3.3-70B-Instruct).\n",
-"- llama4: Llama 4 (e.g. meta-llama/Llama-4-Scout-17B-16E-Instruct).\n",
-"- mistral: Mistral (e.g. mistralai/Mistral-7B-Instruct-v0.3, mistralai/Mistral-Nemo-Instruct-2407, mistralai/\n",
-"Mistral-Nemo-Instruct-2407, mistralai/Mistral-7B-v0.3).\n",
-"- qwen25: Qwen 2.5 (e.g. Qwen/Qwen2.5-1.5B-Instruct, Qwen/Qwen2.5-7B-Instruct) and QwQ (i.e. Qwen/QwQ-32B). Especially, for QwQ, we can enable the reasoning parser together with tool call parser, details about reasoning parser can be found in [reasoning parser](https://docs.sglang.ai/backend/separate_reasoning.html).\n",
-"- deepseekv3: DeepSeek-v3 (e.g., deepseek-ai/DeepSeek-V3-0324).\n",
-"- gpt-oss: GPT-OSS (e.g., openai/gpt-oss-120b, openai/gpt-oss-20b, lmsys/gpt-oss-120b-bf16, lmsys/gpt-oss-20b-bf16). Note: The gpt-oss tool parser filters out analysis channel events and only preserves normal text. This can cause the content to be empty when explanations are in the analysis channel. To work around this, complete the tool round by returning tool results as role=\"tool\" messages, which enables the model to generate the final content.\n",
-"- kimi_k2: moonshotai/Kimi-K2-Instruct"
+"Note that `--tool-call-parser` defines the parser used to interpret responses."
]
},
{
@@ -169,11 +178,11 @@
" tools=tools,\n",
")\n",
"print_highlight(\"Non-stream response:\")\n",
-"print(response_non_stream)\n",
+"print_highlight(response_non_stream)\n",
"print_highlight(\"==== content ====\")\n",
-"print(response_non_stream.choices[0].message.content)\n",
+"print_highlight(response_non_stream.choices[0].message.content)\n",
"print_highlight(\"==== tool_calls ====\")\n",
-"print(response_non_stream.choices[0].message.tool_calls)"
+"print_highlight(response_non_stream.choices[0].message.tool_calls)"
]
},
{
@@ -234,11 +243,11 @@
" if chunk.choices[0].delta.tool_calls:\n",
" tool_calls.append(chunk.choices[0].delta.tool_calls[0])\n",
"print_highlight(\"==== Text ====\")\n",
-"print(texts)\n",
+"print_highlight(texts)\n",
"\n",
"print_highlight(\"==== Tool Call ====\")\n",
"for tool_call in tool_calls:\n",
-" print(tool_call)"
+" print_highlight(tool_call)"
]
},
{
@@ -350,10 +359,10 @@
" tools=tools,\n",
")\n",
"print_highlight(\"Non-stream response:\")\n",
-"print(final_response)\n",
+"print_highlight(final_response)\n",
"\n",
"print_highlight(\"==== Text ====\")\n",
-"print(final_response.choices[0].message.content)"
+"print_highlight(final_response.choices[0].message.content)"
]
},
{
@@ -396,7 +405,7 @@
"}\n",
"gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
"print_highlight(\"==== Response ====\")\n",
-"print(gen_response)\n",
+"print_highlight(gen_response)\n",
"\n",
"# parse the response\n",
"parse_url = f\"http://localhost:{port}/parse_function_call\"\n",
@@ -463,8 +472,8 @@
"result = llm.generate(input_ids=input_ids, sampling_params=sampling_params)\n",
"generated_text = result[\"text\"] # Assume there is only one prompt\n",
"\n",
-"print(\"=== Offline Engine Output Text ===\")\n",
-"print(generated_text)\n",
+"print_highlight(\"=== Offline Engine Output Text ===\")\n",
+"print_highlight(generated_text)\n",
"\n",
"\n",
"# 2) Parse using FunctionCallParser\n",
@@ -485,13 +494,13 @@
"parser = FunctionCallParser(tools=tools, tool_call_parser=\"qwen25\")\n",
"normal_text, calls = parser.parse_non_stream(generated_text)\n",
"\n",
-"print(\"=== Parsing Result ===\")\n",
+"print_highlight(\"=== Parsing Result ===\")\n",
"print(\"Normal text portion:\", normal_text)\n",
-"print(\"Function call portion:\")\n",
+"print_highlight(\"Function call portion:\")\n",
"for call in calls:\n",
" # call: ToolCallItem\n",
-" print(f\" - tool name: {call.name}\")\n",
-" print(f\" parameters: {call.parameters}\")\n",
+" print_highlight(f\" - tool name: {call.name}\")\n",
+" print_highlight(f\" parameters: {call.parameters}\")\n",
"\n",
"# 3) If needed, perform additional logic on the parsed functions, such as automatically calling the corresponding function to obtain a return value, etc."
]
@@ -537,7 +546,7 @@
"\n",
"# Start a new server session for tool choice examples\n",
"server_process_tool_choice, port_tool_choice = launch_server_cmd(\n",
-" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0\"\n",
+" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\"\n",
")\n",
"wait_for_server(f\"http://localhost:{port_tool_choice}\")\n",
"\n",
@@ -628,8 +637,8 @@
"\n",
"if response_specific.choices[0].message.tool_calls:\n",
" tool_call = response_specific.choices[0].message.tool_calls[0]\n",
-" print(f\"Called function: {tool_call.function.name}\")\n",
-" print(f\"Arguments: {tool_call.function.arguments}\")"
+" print_highlight(f\"Called function: {tool_call.function.name}\")\n",
+" print_highlight(f\"Arguments: {tool_call.function.arguments}\")"
]
},
{
@@ -682,7 +691,7 @@
"import openai\n",
"\n",
"server_process, port = launch_server_cmd(\n",
-" \" python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --tool-call-parser pythonic --tp 1\" # llama-3.2-1b-instruct\n",
+" \" python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --tool-call-parser pythonic --tp 1 --log-level warning\" # llama-3.2-1b-instruct\n",
")\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
"\n",
@@ -762,7 +771,7 @@
" tools=tools,\n",
")\n",
"print_highlight(\"Non-stream response:\")\n",
-"print(response_non_stream)\n",
+"print_highlight(response_non_stream)\n",
"\n",
"response_stream = client.chat.completions.create(\n",
" model=model_name,\n",
@@ -785,11 +794,11 @@
"\n",
"print_highlight(\"Streaming Response:\")\n",
"print_highlight(\"==== Text ====\")\n",
-"print(texts)\n",
+"print_highlight(texts)\n",
"\n",
"print_highlight(\"==== Tool Call ====\")\n",
"for tool_call in tool_calls:\n",
-" print(tool_call)\n",
+" print_highlight(tool_call)\n",
"\n",
"terminate_process(server_process)"
]
...
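For reference, the request pattern the renamed Tool Parser notebook exercises looks roughly like this: start the server with `--tool-call-parser qwen25` (as in the hunks above), send a chat completion with a tool schema, and read back structured tool calls. The tool definition and port value below are illustrative placeholders, not part of the commit:

```python
# Hedged sketch of calling the OpenAI-compatible endpoint with a tool schema.
# Assumes a server launched with --model-path Qwen/Qwen2.5-7B-Instruct
# --tool-call-parser qwen25 (see the diff above); port is a placeholder.
from openai import OpenAI

port = 30000  # in the notebooks this comes from launch_server_cmd
client = OpenAI(api_key="None", base_url=f"http://localhost:{port}/v1")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",  # illustrative tool, not from the commit
            "description": "Get the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

response = client.chat.completions.create(
    model="Qwen/Qwen2.5-7B-Instruct",
    messages=[{"role": "user", "content": "What is the weather in Boston?"}],
    tools=tools,
)

# With a tool-call parser configured, the call comes back structured
# instead of as raw text in message.content.
for tool_call in response.choices[0].message.tool_calls or []:
    print(tool_call.function.name, tool_call.function.arguments)
```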
@@ -43,7 +43,7 @@
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"server_process, port = launch_server_cmd(\n",
-" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\"\n",
+" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")"
@@ -267,7 +267,7 @@
"embedding_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
-" --host 0.0.0.0 --is-embedding\n",
+" --host 0.0.0.0 --is-embedding --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
@@ -316,7 +316,7 @@
"reranker_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path BAAI/bge-reranker-v2-m3 \\\n",
-" --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding\n",
+" --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
@@ -376,7 +376,7 @@
"\n",
"reward_process, port = launch_server_cmd(\n",
" \"\"\"\n",
-"python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding\n",
+"python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
@@ -441,7 +441,7 @@
"outputs": [],
"source": [
"expert_record_server_process, port = launch_server_cmd(\n",
-" \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0 --expert-distribution-recorder-mode stat\"\n",
+" \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0 --expert-distribution-recorder-mode stat --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")"
...
@@ -36,7 +36,7 @@
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"server_process, port = launch_server_cmd(\n",
-" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\"\n",
+" \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
...
@@ -33,7 +33,7 @@
"embedding_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
-" --host 0.0.0.0 --is-embedding\n",
+" --host 0.0.0.0 --is-embedding --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
...
@@ -35,7 +35,7 @@
"\n",
"vision_process, port = launch_server_cmd(\n",
" \"\"\"\n",
-"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct\n",
+"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
...
@@ -34,7 +34,7 @@
"server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \\\n",
-" --host 0.0.0.0\n",
+" --host 0.0.0.0 --log-level warning\n",
"\"\"\"\n",
")\n",
"\n",
...
@@ -38,7 +38,7 @@ The core features include:
advanced_features/speculative_decoding.ipynb
advanced_features/structured_outputs.ipynb
advanced_features/structured_outputs_for_reasoning_models.ipynb
-advanced_features/function_calling.ipynb
+advanced_features/tool_parser.ipynb
advanced_features/separate_reasoning.ipynb
advanced_features/quantization.md
advanced_features/lora.ipynb
...
@@ -39,7 +39,7 @@
"from sglang.utils import print_highlight, terminate_process, wait_for_server\n",
"\n",
"server_process, port = launch_server_cmd(\n",
-" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0\"\n",
+" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
@@ -395,7 +395,7 @@
"outputs": [],
"source": [
"server_process, port = launch_server_cmd(\n",
-" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --host 0.0.0.0\"\n",
+" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --host 0.0.0.0 --log-level warning\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
...
@@ -457,6 +457,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
+To reduce the log length, we set the log level to warning for the server, the default log level is info.
We are running those notebooks in a CI environment, so the throughput is not representative of the actual performance.
"""
)
...