Unverified Commit acd1a159 authored by simveit, committed by GitHub

Docs: Implemented frontend docs (#3791)


Co-authored-by: Chayenne <zhaochen20@outlook.com>
parent 7c1692aa
...@@ -23,6 +23,17 @@
"Additionally, you can easily build a custom server on top of the SGLang offline engine. A detailed example working in a python script can be found in [custom_server](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/custom_server.py)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Advanced Usage\n",
"\n",
"The engine supports [vlm inference](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/offline_batch_inference_vlm.py) as well as [extracting hidden states](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/hidden_states.py). \n",
"\n",
"Please see [the examples](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine) for further use cases."
]
},
{
"cell_type": "markdown",
"metadata": {},
...@@ -39,14 +50,22 @@
"outputs": [],
"source": [
"# launch the offline engine\n",
"from sglang.utils import stream_and_merge, async_stream_and_merge\n",
"import sglang as sgl\n",
"import asyncio\n",
"import io\n",
"import os\n",
"\n",
"from PIL import Image\n",
"import requests\n",
"import sglang as sgl\n",
"\n",
"from sglang.srt.conversation import chat_templates\n",
"from sglang.test.test_utils import is_in_ci\n",
"from sglang.utils import async_stream_and_merge, stream_and_merge\n",
"\n",
"if is_in_ci():\n",
" import patch\n",
"\n",
"\n",
"llm = sgl.Engine(model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")"
]
},
...@@ -185,57 +204,6 @@
"asyncio.run(main())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"llm.shutdown()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Return Hidden States"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"llm = sgl.Engine(\n",
" model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\", return_hidden_states=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompts = [\n",
" \"Hello, my name is\",\n",
" \"The president of the United States is\",\n",
" \"The capital of France is\",\n",
" \"The future of AI is\",\n",
"]\n",
"\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95, \"max_new_tokens\": 10}\n",
"\n",
"outputs = llm.generate(prompts, sampling_params=sampling_params)\n",
"for prompt, output in zip(prompts, outputs):\n",
" print(\"===============================\")\n",
" print(\n",
" f\"Prompt: {prompt}\\nGenerated text: {output['text']}\\nPrompt_Tokens: {output['meta_info']['prompt_tokens']}\\tCompletion_tokens: {output['meta_info']['completion_tokens']}\\nHidden states: {[i.shape for i in output['meta_info']['hidden_states']]}\"\n",
" )\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": null,
......
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SGLang Frontend Language"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"SGLang frontend language can be used to define simple and easy prompts in a convenient, structured way."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Launch A Server\n",
"\n",
"Launch the server in your terminal and wait for it to initialize."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import os\n",
"\n",
"from sglang import assistant_begin, assistant_end\n",
"from sglang import assistant, function, gen, system, user\n",
"from sglang import image\n",
"from sglang import RuntimeEndpoint, set_default_backend\n",
"from sglang.srt.utils import load_image\n",
"from sglang.test.test_utils import is_in_ci\n",
"from sglang.utils import print_highlight, terminate_process, wait_for_server\n",
"\n",
"if is_in_ci():\n",
" from patch import launch_server_cmd\n",
"else:\n",
" from sglang.utils import launch_server_cmd\n",
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
"print(f\"Server started on http://localhost:{port}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set the default backend. Note: Besides the local server, you may use also `OpenAI` or other API endpoints."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"set_default_backend(RuntimeEndpoint(f\"http://localhost:{port}\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Basic Usage\n",
"\n",
"The most simple way of using SGLang frontend language is a simple question answer dialog between a user and an assistant."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def basic_qa(s, question):\n",
" s += system(f\"You are a helpful assistant than can answer questions.\")\n",
" s += user(question)\n",
" s += assistant(gen(\"answer\", max_tokens=512))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"state = basic_qa(\"List 3 countries and their capitals.\")\n",
"print_highlight(state[\"answer\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Multi-turn Dialog\n",
"\n",
"SGLang frontend language can also be used to define multi-turn dialogs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def multi_turn_qa(s):\n",
" s += system(f\"You are a helpful assistant than can answer questions.\")\n",
" s += user(\"Please give me a list of 3 countries and their capitals.\")\n",
" s += assistant(gen(\"first_answer\", max_tokens=512))\n",
" s += user(\"Please give me another list of 3 countries and their capitals.\")\n",
" s += assistant(gen(\"second_answer\", max_tokens=512))\n",
" return s\n",
"\n",
"\n",
"state = multi_turn_qa()\n",
"print_highlight(state[\"first_answer\"])\n",
"print_highlight(state[\"second_answer\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Control flow\n",
"\n",
"You may use any Python code within the function to define more complex control flows."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def tool_use(s, question):\n",
" s += assistant(\n",
" \"To answer this question: \"\n",
" + question\n",
" + \". I need to use a \"\n",
" + gen(\"tool\", choices=[\"calculator\", \"search engine\"])\n",
" + \". \"\n",
" )\n",
"\n",
" if s[\"tool\"] == \"calculator\":\n",
" s += assistant(\"The math expression is: \" + gen(\"expression\"))\n",
" elif s[\"tool\"] == \"search engine\":\n",
" s += assistant(\"The key word to search is: \" + gen(\"word\"))\n",
"\n",
"\n",
"state = tool_use(\"What is 2 * 2?\")\n",
"print_highlight(state[\"tool\"])\n",
"print_highlight(state[\"expression\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parallelism\n",
"\n",
"Use `fork` to launch parallel prompts. Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def tip_suggestion(s):\n",
" s += assistant(\n",
" \"Here are two tips for staying healthy: \"\n",
" \"1. Balanced Diet. 2. Regular Exercise.\\n\\n\"\n",
" )\n",
"\n",
" forks = s.fork(2)\n",
" for i, f in enumerate(forks):\n",
" f += assistant(\n",
" f\"Now, expand tip {i+1} into a paragraph:\\n\"\n",
" + gen(\"detailed_tip\", max_tokens=256, stop=\"\\n\\n\")\n",
" )\n",
"\n",
" s += assistant(\"Tip 1:\" + forks[0][\"detailed_tip\"] + \"\\n\")\n",
" s += assistant(\"Tip 2:\" + forks[1][\"detailed_tip\"] + \"\\n\")\n",
" s += assistant(\n",
" \"To summarize the above two tips, I can say:\\n\" + gen(\"summary\", max_tokens=512)\n",
" )\n",
"\n",
"\n",
"state = tip_suggestion()\n",
"print_highlight(state[\"summary\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Constrained Decoding\n",
"\n",
"Use `regex` to specify a regular expression as a decoding constraint. This is only supported for local models."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def regular_expression_gen(s):\n",
" s += user(\"What is the IP address of the Google DNS servers?\")\n",
" s += assistant(\n",
" gen(\n",
" \"answer\",\n",
" temperature=0,\n",
" regex=r\"((25[0-5]|2[0-4]\\d|[01]?\\d\\d?).){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\",\n",
" )\n",
" )\n",
"\n",
"\n",
"state = regular_expression_gen()\n",
"print_highlight(state[\"answer\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use `regex` to define a `JSON` decoding schema."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"character_regex = (\n",
" r\"\"\"\\{\\n\"\"\"\n",
" + r\"\"\" \"name\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
" + r\"\"\" \"house\": \"(Gryffindor|Slytherin|Ravenclaw|Hufflepuff)\",\\n\"\"\"\n",
" + r\"\"\" \"blood status\": \"(Pure-blood|Half-blood|Muggle-born)\",\\n\"\"\"\n",
" + r\"\"\" \"occupation\": \"(student|teacher|auror|ministry of magic|death eater|order of the phoenix)\",\\n\"\"\"\n",
" + r\"\"\" \"wand\": \\{\\n\"\"\"\n",
" + r\"\"\" \"wood\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
" + r\"\"\" \"core\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
" + r\"\"\" \"length\": [0-9]{1,2}\\.[0-9]{0,2}\\n\"\"\"\n",
" + r\"\"\" \\},\\n\"\"\"\n",
" + r\"\"\" \"alive\": \"(Alive|Deceased)\",\\n\"\"\"\n",
" + r\"\"\" \"patronus\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
" + r\"\"\" \"bogart\": \"[\\w\\d\\s]{1,16}\"\\n\"\"\"\n",
" + r\"\"\"\\}\"\"\"\n",
")\n",
"\n",
"\n",
"@function\n",
"def character_gen(s, name):\n",
" s += user(\n",
" f\"{name} is a character in Harry Potter. Please fill in the following information about this character.\"\n",
" )\n",
" s += assistant(gen(\"json_output\", max_tokens=256, regex=character_regex))\n",
"\n",
"\n",
"state = character_gen(\"Harry Potter\")\n",
"print_highlight(state[\"json_output\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Batching \n",
"\n",
"Use `run_batch` to run a batch of prompts."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def text_qa(s, question):\n",
" s += user(question)\n",
" s += assistant(gen(\"answer\", stop=\"\\n\"))\n",
"\n",
"\n",
"states = text_qa.run_batch(\n",
" [\n",
" {\"question\": \"What is the capital of the United Kingdom?\"},\n",
" {\"question\": \"What is the capital of France?\"},\n",
" {\"question\": \"What is the capital of Japan?\"},\n",
" ],\n",
" progress_bar=True,\n",
")\n",
"\n",
"for i, state in enumerate(states):\n",
" print_highlight(f\"Answer {i+1}: {states[i]['answer']}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Streaming \n",
"\n",
"Use `stream` to stream the output to the user."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def text_qa(s, question):\n",
" s += user(question)\n",
" s += assistant(gen(\"answer\", stop=\"\\n\"))\n",
"\n",
"\n",
"state = text_qa.run(\n",
" question=\"What is the capital of France?\", temperature=0.1, stream=True\n",
")\n",
"\n",
"for out in state.text_iter():\n",
" print(out, end=\"\", flush=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Complex Prompts\n",
"\n",
"You may use `{system|user|assistant}_{begin|end}` to define complex prompts."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def chat_example(s):\n",
" s += system(\"You are a helpful assistant.\")\n",
" # Same as: s += s.system(\"You are a helpful assistant.\")\n",
"\n",
" with s.user():\n",
" s += \"Question: What is the capital of France?\"\n",
"\n",
" s += assistant_begin()\n",
" s += \"Answer: \" + gen(\"answer\", max_tokens=100, stop=\"\\n\")\n",
" s += assistant_end()\n",
"\n",
"\n",
"state = chat_example()\n",
"print_highlight(state[\"answer\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Multi-modal Generation\n",
"\n",
"You may use SGLang frontend language to define multi-modal prompts.\n",
"See [here](https://docs.sglang.ai/references/supported_models.html) for supported models."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --host 0.0.0.0\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
"print(f\"Server started on http://localhost:{port}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"set_default_backend(RuntimeEndpoint(f\"http://localhost:{port}\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ask a question about an image."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def image_qa(s, image_file, question):\n",
" s += user(image(image_file) + question)\n",
" s += assistant(gen(\"answer\", max_tokens=256))\n",
"\n",
"\n",
"image_url = \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
"image_bytes, _ = load_image(image_url)\n",
"state = image_qa(image_bytes, \"What is in the image?\")\n",
"print_highlight(state[\"answer\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
# Structured Generation Language
The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may find it easier to use for complex prompting workflows.
## Quick Start
The example below shows how to use SGLang to answer a multi-turn question.
### Using Local Models
First, launch a server with
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
```
Then, connect to the server and answer a multi-turn question.
```python
from sglang import function, system, user, assistant, gen, set_default_backend, RuntimeEndpoint


@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))


set_default_backend(RuntimeEndpoint("http://localhost:30000"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])

print(state["answer_1"])
```
### Using OpenAI Models
Set the OpenAI API Key
```
export OPENAI_API_KEY=sk-******
```
Then, answer a multi-turn question.
```python
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI


@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))


set_default_backend(OpenAI("gpt-3.5-turbo"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])

print(state["answer_1"])
```
### More Examples
Anthropic and VertexAI (Gemini) models are also supported.
You can find more examples at [examples/quick_start](https://github.com/sgl-project/sglang/tree/main/examples/frontend_language/quick_start).
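Below is a minimal sketch of switching to one of these backends. The model names are only illustrative, and the corresponding API keys or cloud credentials are assumed to be configured in your environment.
```python
from sglang import Anthropic, VertexAI, set_default_backend

# Pick one backend; the model names here are examples, not recommendations.
set_default_backend(Anthropic("claude-3-haiku-20240307"))
# set_default_backend(VertexAI("gemini-pro"))
```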
## Language Feature
To begin with, import sglang.
```python
import sglang as sgl
```
`sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
You can implement your prompt flow in a function decorated by `sgl.function`.
You can then invoke the function with `run` or `run_batch`.
The system will manage the state, chat template, parallelism and batching for you.
The complete code for the examples below can be found at [readme_examples.py](https://github.com/sgl-project/sglang/blob/main/examples/frontend_language/usage/readme_examples.py).
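As a quick illustration of these primitives, the sketch below uses `select` to restrict the answer to a fixed label set (assuming `select` takes a variable name and a `choices` list, mirroring the `choices` argument of `gen`). The function name and review text are made up for the example, and a default backend is assumed to be set already.
```python
import sglang as sgl


@sgl.function
def classify_review(s, review):
    # Ask the model for a label, restricting the output to two choices.
    s += "Review: " + review + "\n"
    s += "Sentiment: " + sgl.select("label", choices=["positive", "negative"])


# Assumes set_default_backend(...) has already been called.
state = classify_review.run(review="The food was great and the service was fast.")
print(state["label"])
```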
### Control Flow
You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
```python
@sgl.function
def tool_use(s, question):
s += "To answer this question: " + question + ". "
s += "I need to use a " + sgl.gen("tool", choices=["calculator", "search engine"]) + ". "
if s["tool"] == "calculator":
s += "The math expression is" + sgl.gen("expression")
elif s["tool"] == "search engine":
s += "The key word to search is" + sgl.gen("word")
```
### Parallelism
Use `fork` to launch parallel prompts.
Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel.
```python
@sgl.function
def tip_suggestion(s):
    s += (
        "Here are two tips for staying healthy: "
        "1. Balanced Diet. 2. Regular Exercise.\n\n"
    )

    forks = s.fork(2)
    for i, f in enumerate(forks):
        f += f"Now, expand tip {i+1} into a paragraph:\n"
        f += sgl.gen("detailed_tip", max_tokens=256, stop="\n\n")

    s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
    s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
    s += "In summary" + sgl.gen("summary")
```
### Multi-Modality
Use `sgl.image` to pass an image as input.
```python
@sgl.function
def image_qa(s, image_file, question):
    s += sgl.user(sgl.image(image_file) + question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=256))
```
See also [local_example_llava_next.py](https://github.com/sgl-project/sglang/blob/main/examples/frontend_language/quick_start/local_example_llava_next.py).
### Constrained Decoding
Use `regex` to specify a regular expression as a decoding constraint.
This is only supported for local models.
```python
@sgl.function
def regular_expression_gen(s):
    s += "Q: What is the IP address of the Google DNS servers?\n"
    s += "A: " + sgl.gen(
        "answer",
        temperature=0,
        regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
    )
```
### JSON Decoding
Use `regex` to specify a JSON schema with a regular expression.
```python
character_regex = (
r"""\{\n"""
+ r""" "name": "[\w\d\s]{1,16}",\n"""
+ r""" "house": "(Gryffindor|Slytherin|Ravenclaw|Hufflepuff)",\n"""
+ r""" "blood status": "(Pure-blood|Half-blood|Muggle-born)",\n"""
+ r""" "occupation": "(student|teacher|auror|ministry of magic|death eater|order of the phoenix)",\n"""
+ r""" "wand": \{\n"""
+ r""" "wood": "[\w\d\s]{1,16}",\n"""
+ r""" "core": "[\w\d\s]{1,16}",\n"""
+ r""" "length": [0-9]{1,2}\.[0-9]{0,2}\n"""
+ r""" \},\n"""
+ r""" "alive": "(Alive|Deceased)",\n"""
+ r""" "patronus": "[\w\d\s]{1,16}",\n"""
+ r""" "bogart": "[\w\d\s]{1,16}"\n"""
+ r"""\}"""
)
@sgl.function
def character_gen(s, name):
s += name + " is a character in Harry Potter. Please fill in the following information about this character.\n"
s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
```
See also [json_decode.py](https://github.com/sgl-project/sglang/blob/main/examples/frontend_language/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
### Batching
Use `run_batch` to run a batch of requests with continuous batching.
```python
@sgl.function
def text_qa(s, question):
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n")


states = text_qa.run_batch(
    [
        {"question": "What is the capital of the United Kingdom?"},
        {"question": "What is the capital of France?"},
        {"question": "What is the capital of Japan?"},
    ],
    progress_bar=True
)
```
### Streaming
Add `stream=True` to enable streaming.
```python
@sgl.function
def text_qa(s, question):
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n")


state = text_qa.run(
    question="What is the capital of France?",
    temperature=0.1,
    stream=True
)

for out in state.text_iter():
    print(out, end="", flush=True)
```
### Roles
Use `sgl.system`, `sgl.user`, and `sgl.assistant` to set roles when using chat models. You can also define more complex role prompts using begin and end tokens.
```python
@sgl.function
def chat_example(s):
    s += sgl.system("You are a helpful assistant.")
    # Same as: s += s.system("You are a helpful assistant.")

    with s.user():
        s += "Question: What is the capital of France?"

    s += sgl.assistant_begin()
    s += "Answer: " + sgl.gen(max_tokens=100, stop="\n")
    s += sgl.assistant_end()
```
### Tips and Implementation Details
- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability (see the sketch after this list).
- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
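The snippet below is a toy illustration of the token-length normalization idea behind `choices`, not SGLang's actual implementation; the log-probability numbers are made up.
```python
def pick_choice(choice_token_logprobs):
    """Pick the choice with the highest average (length-normalized) log probability."""

    def normalized(logprobs):
        # Average log probability per token, so longer choices are not penalized
        # simply for containing more tokens.
        return sum(logprobs) / len(logprobs)

    return max(choice_token_logprobs, key=lambda c: normalized(choice_token_logprobs[c]))


# Hypothetical per-token log probabilities for two candidate continuations.
scores = {
    "calculator": [-1.2, -0.7],           # sum = -1.9, mean = -0.95
    "search engine": [-0.9, -0.8, -0.6],  # sum = -2.3, mean ~ -0.77
}

# A raw sum would favor "calculator"; the normalized mean favors "search engine".
print(pick_choice(scores))
```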
import os
import weakref

from sglang.utils import execute_shell_command, reserve_port

DEFAULT_MAX_RUNNING_REQUESTS = 200
DEFAULT_MAX_TOTAL_TOKENS = 20480

import sglang.srt.server_args as server_args_mod

# Patch ServerArgs so servers launched from the docs use smaller limits and skip CUDA graph capture.
_original_post_init = server_args_mod.ServerArgs.__post_init__


def patched_post_init(self):
    _original_post_init(self)
    if self.max_running_requests is None:
        self.max_running_requests = DEFAULT_MAX_RUNNING_REQUESTS
    if self.max_total_tokens is None:
        self.max_total_tokens = DEFAULT_MAX_TOTAL_TOKENS
    self.disable_cuda_graph = True


server_args_mod.ServerArgs.__post_init__ = patched_post_init

# Keep each launched server process associated with the socket that reserves its port.
process_socket_map = weakref.WeakKeyDictionary()


def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
    """
    Launch the server using the given command.
    If no port is specified, a free port is reserved.
    """
    if port is None:
        port, lock_socket = reserve_port(host)
    else:
        lock_socket = None

    extra_flags = (
        f"--max-running-requests {DEFAULT_MAX_RUNNING_REQUESTS} "
        f"--max-total-tokens {DEFAULT_MAX_TOTAL_TOKENS} "
        f"--disable-cuda-graph"
    )

    full_command = f"{command} --port {port} {extra_flags}"
    process = execute_shell_command(full_command)

    if lock_socket is not None:
        process_socket_map[process] = lock_socket

    return process, port
...@@ -30,7 +30,6 @@ The core features include:
backend/sampling_params.md
backend/hyperparameter_tuning.md
.. toctree::
:maxdepth: 1
:caption: Advanced Features
...@@ -45,7 +44,7 @@ The core features include:
:maxdepth: 1
:caption: Frontend Tutorial
frontend/frontend.md
frontend/frontend.ipynb
frontend/choices_methods.md
.. toctree::
......
"""
Usage:
python hidden_states.py
Note that we are actively working on moving return_hidden_states to the sampling_params.
"""
import sglang as sgl


def main():
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Create an LLM.
    llm = sgl.Engine(
        model_path="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
        return_hidden_states=True,
    )

    sampling_params = {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 10}

    outputs = llm.generate(prompts, sampling_params=sampling_params)
    for prompt, output in zip(prompts, outputs):
        print("===============================")
        print(
            f"Prompt: {prompt}\nGenerated text: {output['text']}\nPrompt_Tokens: {output['meta_info']['prompt_tokens']}\tCompletion_tokens: {output['meta_info']['completion_tokens']}\nHidden states: {[i.shape for i in output['meta_info']['hidden_states']]}"
        )
        print()


# The __main__ condition is necessary here because we use "spawn" to create subprocesses.
# Spawn starts a fresh program every time; without the __main__ guard, sgl.Engine would
# keep spawning processes in an infinite loop.
if __name__ == "__main__":
    main()
...@@ -5,56 +5,45 @@ python offline_batch_inference_vlm.py --model-path Qwen/Qwen2-VL-7B-Instruct --c
import argparse
import dataclasses
from transformers import AutoProcessor
import io
import os
import requests
from PIL import Image
import sglang as sgl
from sglang.srt.openai_api.adapter import v1_chat_generate_request
from sglang.srt.openai_api.protocol import ChatCompletionRequest
from sglang.srt.conversation import chat_templates
from sglang.srt.server_args import ServerArgs


def main(
    server_args: ServerArgs,
):
    # Create an LLM.
    vlm = sgl.Engine(**dataclasses.asdict(server_args))

    # prepare prompts.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What’s in this image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true",
                    },
                },
            ],
        }
    ]
    chat_request = ChatCompletionRequest(
        messages=messages,
        model=server_args.model_path,
        temperature=0.8,
        top_p=0.95,
    )
    gen_request, _ = v1_chat_generate_request(
        [chat_request],
        vlm.tokenizer_manager,
    )
    outputs = vlm.generate(
        input_ids=gen_request.input_ids,
        image_data=gen_request.image_data,
        sampling_params=gen_request.sampling_params,
    conv = chat_templates[server_args.chat_template].copy()
    image_token = conv.image_token
    image_url = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
    prompt = f"What's in this image?\n{image_token}"
    sampling_params = {
        "temperature": 0.001,
        "max_new_tokens": 30,
    }
    output = vlm.generate(
        prompt=prompt,
        image_data=image_url,
        sampling_params=sampling_params,
    )
    print("===============================")
    print(f"Prompt: {messages[0]['content'][0]['text']}")
    print(f"Generated text: {outputs['text']}")
    print(f"Prompt: {prompt}")
    print(f"Generated text: {output['text']}")
    vlm.shutdown()
# The __main__ condition is necessary here because we use "spawn" to create subprocesses
...@@ -63,5 +52,6 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)
    args = parser.parse_args()
    server_args = ServerArgs.from_cli_args(args)
    main(server_args)