change / sglang · Commits

Commit 06175286 (unverified)

Update quick start examples (#120)

Authored Jan 30, 2024 by Lianmin Zheng; committed via GitHub on Jan 30, 2024.
Parent commit: 4ea92f83

Showing 20 changed files with 555 additions and 225 deletions (+555, -225).
Changed files:

- README.md (+15, -14)
- examples/quick_start/anthropic_example_chat.py (+61, -13)
- examples/quick_start/anthropic_example_complete.py (+49, -8)
- examples/quick_start/anthropic_example_stream.py (+0, -20, deleted)
- examples/quick_start/gemini_example_chat.py (+67, -0, new file)
- examples/quick_start/gemini_example_complete.py (+49, -8)
- examples/quick_start/gemini_example_multimodal_chat.py (+23, -13)
- examples/quick_start/gemini_example_stream.py (+0, -20, deleted)
- examples/quick_start/openai_example_chat.py (+62, -14)
- examples/quick_start/openai_example_complete.py (+49, -8)
- examples/quick_start/openai_example_stream.py (+0, -21, deleted)
- examples/quick_start/srt_example_chat.py (+60, -17)
- examples/quick_start/srt_example_complete.py (+50, -10)
- examples/quick_start/srt_example_llava.py (+50, -26)
- examples/quick_start/srt_example_stream.py (+0, -26, deleted)
- examples/usage/srt_example_regex.py (+0, -0, moved from examples/quick_start/)
- python/sglang/lang/interpreter.py (+4, -2)
- python/sglang/lang/ir.py (+3, -1)
- python/sglang/srt/models/qwen2.py (+2, -1)
- python/sglang/srt/server.py (+11, -3)
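All of the provider examples in this commit converge on the same skeleton: the prompt program is declared with `@sgl.function`, and the former top-level script body is split into `single()`, `stream()`, and `batch()` entry points invoked from a `__main__` guard. A minimal sketch of that shared shape, using the OpenAI backend exactly as openai_example_chat.py below does (the other backends drop in the same way):

```python
import sglang as sgl


@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))


def single():
    # Blocking call; the finished state exposes messages() and named answers.
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
    )
    print(state["answer_1"])


def stream():
    # stream=True returns immediately; text_iter() yields text chunks.
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
        stream=True,
    )
    for out in state.text_iter():
        print(out, end="", flush=True)
    print()


def batch():
    # run_batch() executes a list of argument dicts, one state per dict.
    states = multi_turn_question.run_batch([
        {"question_1": "What is the capital of the United States?",
         "question_2": "List two local attractions."},
        {"question_1": "What is the capital of France?",
         "question_2": "What is the population of this city?"},
    ])
    for s in states:
        print(s.messages())


if __name__ == "__main__":
    sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
    single()
    stream()
    batch()
```

The per-file diffs below are instances of this refactor; the standalone *_example_stream.py scripts are deleted because their streaming path moved into each file's `stream()` function.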
README.md (+15, -14)

````diff
@@ -39,18 +39,20 @@ pip install -e "python[all]"
 - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
 
 ## Quick Start
 The example below shows how to use sglang to answer a multi-turn question.
 
-### Using OpenAI Models
-Set the OpenAI API Key
+### Using Local Models
+First, launch a server with
 ```
-export OPENAI_API_KEY=sk-******
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
 ```
 
-Then, answer a multi-turn question.
+Then, connect to the server and answer a multi-turn question.
 ```python
-from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
+from sglang import function, system, user, assistant, gen, set_default_backend, RuntimeEndpoint
 
 @function
 def multi_turn_question(s, question_1, question_2):
@@ -60,7 +62,7 @@ def multi_turn_question(s, question_1, question_2):
     s += user(question_2)
     s += assistant(gen("answer_2", max_tokens=256))
 
-set_default_backend(OpenAI("gpt-3.5-turbo"))
+set_default_backend(RuntimeEndpoint("http://localhost:30000"))
 
 state = multi_turn_question.run(
     question_1="What is the capital of the United States?",
@@ -73,16 +75,15 @@ for m in state.messages():
 print(state["answer_1"])
 ```
 
-### Using Local Models
-First, launch a server with
+### Using OpenAI Models
+Set the OpenAI API Key
 ```
-python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+export OPENAI_API_KEY=sk-******
 ```
 
-Then, connect to the server and answer a multi-turn question.
+Then, answer a multi-turn question.
 ```python
-from sglang import function, system, user, assistant, gen, set_default_backend, RuntimeEndpoint
+from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
 
 @function
 def multi_turn_question(s, question_1, question_2):
@@ -92,7 +93,7 @@ def multi_turn_question(s, question_1, question_2):
     s += user(question_2)
     s += assistant(gen("answer_2", max_tokens=256))
 
-set_default_backend(RuntimeEndpoint("http://localhost:30000"))
+set_default_backend(OpenAI("gpt-3.5-turbo"))
 
 state = multi_turn_question.run(
     question_1="What is the capital of the United States?",
@@ -120,7 +121,7 @@ import sglang as sgl
 `sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
 You can implement your prompt flow in a function decorated by `sgl.function`.
 You can then invoke the function with `run` or `run_batch`.
-The system will manage the state, chat template, and parallelism for you.
+The system will manage the state, chat template, parallelism and batching for you.
 
 ### Control Flow
 You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
````
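Since the README now leads with the local-server path, it may help to see the same decorated function run against a local endpoint in batch mode. A hedged sketch (it assumes the `launch_server` command above is already running on port 30000, and reuses `run_batch` exactly as the quick-start examples below do):

```python
import sglang as sgl


@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))


# Assumes `python -m sglang.launch_server --model-path
# meta-llama/Llama-2-7b-chat-hf --port 30000` is already running.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

# One state per argument dict; the server batches the requests.
states = multi_turn_question.run_batch([
    {"question_1": "What is the capital of the United States?",
     "question_2": "List two local attractions."},
    {"question_1": "What is the capital of France?",
     "question_2": "What is the population of this city?"},
])
for s in states:
    print(s["answer_1"])
```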
examples/quick_start/anthropic_example_chat.py (+61, -13)

```diff
-from sglang import function, system, user, assistant, gen, set_default_backend, Anthropic
+"""
+Usage:
+export ANTHROPIC_API_KEY=sk-******
+python3 anthropic_example_chat.py
+"""
+import sglang as sgl
 
 
-@function
+@sgl.function
 def multi_turn_question(s, question_1, question_2):
-    s += user(question_1)
-    s += assistant(gen("answer_1", max_tokens=256))
-    s += user(question_2)
-    s += assistant(gen("answer_2", max_tokens=256))
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
 
 
-set_default_backend(Anthropic("claude-2"))
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
 
-state = multi_turn_question.run(
-    question_1="What is the capital of the United States?",
-    question_2="List two local attractions.",
-)
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
 
-for m in state.messages():
-    print(m["role"], ":", m["content"])
+    print("answer_1", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch([
+        {"question_1": "What is the capital of the United States?",
+         "question_2": "List two local attractions."},
+        {"question_1": "What is the capital of France?",
+         "question_2": "What is the population of this city?"},
+    ])
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.Anthropic("claude-2"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
```
examples/quick_start/anthropic_example_complete.py (+49, -8)

```diff
-from sglang import function, gen, set_default_backend, Anthropic
+"""
+Usage:
+export ANTHROPIC_API_KEY=sk-******
+python3 anthropic_example_complete.py
+"""
+import sglang as sgl
 
 
-@function
+@sgl.function
 def few_shot_qa(s, question):
     s += ("""
@@ -13,14 +19,49 @@ def few_shot_qa(s, question):
 \n\nAssistant: Rome
 """)
 
     s += "\n\nHuman: " + question + "\n"
-    s += "\n\nAssistant:" + gen("answer", stop="\n", temperature=0)
+    s += "\n\nAssistant:" + sgl.gen("answer", stop="\n", temperature=0)
 
 
-set_default_backend(Anthropic("claude-2"))
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
 
-state = few_shot_qa.run(question="What is the capital of the United States?")
-answer = state["answer"].strip().lower()
+    answer = state["answer"].strip().lower()
+    assert "washington" in answer, f"answer: {state['answer']}"
 
-assert "washington" in answer, f"answer: {state['answer']}"
+    print(state.text())
 
-print(state.text())
+
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?",
+        stream=True)
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch([
+        {"question": "What is the capital of the United States?"},
+        {"question": "What is the capital of China?"},
+    ])
+
+    for s in states:
+        print(s["answer"])
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.Anthropic("claude-2"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
```
examples/quick_start/anthropic_example_stream.py (deleted, file mode 100644 → 0; +0, -20)

```diff
-from sglang import function, system, user, assistant, gen, set_default_backend, Anthropic
-
-
-@function
-def multi_turn_question(s, question_1, question_2):
-    s += user(question_1)
-    s += assistant(gen("answer_1", max_tokens=256))
-    s += user(question_2)
-    s += assistant(gen("answer_2", max_tokens=256))
-
-
-set_default_backend(Anthropic("claude-2"))
-
-state = multi_turn_question.run(
-    question_1="What is the capital of the United States?",
-    question_2="List two local attractions.",
-    stream=True
-)
-
-for out in state.text_iter():
-    print(out, end="", flush=True)
```
examples/quick_start/gemini_example_chat.py (new file, 0 → 100644; +67, -0)

```diff
+"""
+Usage:
+export GCP_PROJECT_ID=******
+python3 gemini_example_chat.py
+"""
+import sglang as sgl
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("answer_1", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch([
+        {"question_1": "What is the capital of the United States?",
+         "question_2": "List two local attractions."},
+        {"question_1": "What is the capital of France?",
+         "question_2": "What is the population of this city?"},
+    ])
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.VertexAI("gemini-pro"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
```
examples/quick_start/gemini_example_complete.py (+49, -8)

```diff
-from sglang import function, gen, set_default_backend, VertexAI
+"""
+Usage:
+export GCP_PROJECT_ID=******
+python3 gemini_example_complete.py
+"""
+import sglang as sgl
 
 
-@function
+@sgl.function
 def few_shot_qa(s, question):
     s += ("""The following are questions with answers.
@@ -13,14 +19,49 @@ Q: What is the capital of Italy?
 A: Rome
 """)
 
     s += "Q: " + question + "\n"
-    s += "A:" + gen("answer", stop="\n", temperature=0)
+    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
 
 
-set_default_backend(VertexAI("gemini-pro"))
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
 
-state = few_shot_qa.run(question="What is the capital of the United States?")
-answer = state["answer"].strip().lower()
+    answer = state["answer"].strip().lower()
+    assert "washington" in answer, f"answer: {state['answer']}"
 
-assert "washington" in answer, f"answer: {state['answer']}"
+    print(state.text())
 
-print(state.text())
+
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?",
+        stream=True)
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch([
+        {"question": "What is the capital of the United States?"},
+        {"question": "What is the capital of China?"},
+    ])
+
+    for s in states:
+        print(s["answer"])
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.VertexAI("gemini-pro"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
```
examples/quick_start/gemini_example_multimodal_chat.py (+23, -13)

```diff
-from sglang import function, user, assistant, gen, image, set_default_backend, VertexAI
+"""
+Usage:
+export GCP_PROJECT_ID=******
+python3 gemini_example_multimodal_chat.py
+"""
+import sglang as sgl
 
 
-@function
+@sgl.function
 def image_qa(s, image_file1, image_file2, question):
-    s += user(image(image_file1) + image(image_file2) + question)
-    s += assistant(gen("answer_1", max_tokens=256))
+    s += sgl.user(sgl.image(image_file1) + sgl.image(image_file2) + question)
+    s += sgl.assistant(sgl.gen("answer", max_tokens=256))
 
 
-set_default_backend(VertexAI("gemini-pro-vision"))
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision"))
 
-state = image_qa.run(
-    image_file1="./images/cat.jpeg",
-    image_file2="./images/dog.jpeg",
-    question="Describe difference of the 2 images in one sentence.",
-    stream=True)
+    state = image_qa.run(
+        image_file1="./images/cat.jpeg",
+        image_file2="./images/dog.jpeg",
+        question="Describe difference of the two images in one sentence.",
+        stream=True)
 
-for out in state.text_iter():
-    print(out, end="", flush=True)
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+    print(state["answer"])
```
examples/quick_start/gemini_example_stream.py (deleted, file mode 100644 → 0; +0, -20)

```diff
-from sglang import function, user, assistant, gen, set_default_backend, VertexAI
-
-
-@function
-def multi_turn_question(s, question_1, question_2):
-    s += user(question_1)
-    s += assistant(gen("answer_1", max_tokens=256))
-    s += user(question_2)
-    s += assistant(gen("answer_2", max_tokens=256))
-
-
-set_default_backend(VertexAI("gemini-pro"))
-
-state = multi_turn_question.run(
-    question_1="What is the capital of the United States?",
-    question_2="List two local attractions.",
-    stream=True
-)
-
-for out in state.text_iter():
-    print(out, end="", flush=True)
```
examples/quick_start/openai_example_chat.py (+62, -14)

```diff
-from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
+"""
+Usage:
+export OPENAI_API_KEY=sk-******
+python3 openai_example_chat.py
+"""
+import sglang as sgl
 
 
-@function
+@sgl.function
 def multi_turn_question(s, question_1, question_2):
-    s += system("You are a helpful assistant.")
-    s += user(question_1)
-    s += assistant(gen("answer_1", max_tokens=256))
-    s += user(question_2)
-    s += assistant(gen("answer_2", max_tokens=256))
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
 
 
-set_default_backend(OpenAI("gpt-3.5-turbo"))
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
 
-state = multi_turn_question.run(
-    question_1="What is the capital of the United States?",
-    question_2="List two local attractions.",
-)
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
 
-for m in state.messages():
-    print(m["role"], ":", m["content"])
+    print("answer_1", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch([
+        {"question_1": "What is the capital of the United States?",
+         "question_2": "List two local attractions."},
+        {"question_1": "What is the capital of France?",
+         "question_2": "What is the population of this city?"},
+    ])
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
```
examples/quick_start/openai_example_complete.py (+49, -8)

```diff
-from sglang import function, gen, set_default_backend, OpenAI
+"""
+Usage:
+export OPENAI_API_KEY=sk-******
+python3 openai_example_complete.py
+"""
+import sglang as sgl
 
 
-@function
+@sgl.function
 def few_shot_qa(s, question):
     s += ("""The following are questions with answers.
@@ -13,14 +19,49 @@ Q: What is the capital of Italy?
 A: Rome
 """)
 
     s += "Q: " + question + "\n"
-    s += "A:" + gen("answer", stop="\n", temperature=0)
+    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
 
 
-set_default_backend(OpenAI("gpt-3.5-turbo-instruct"))
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
 
-state = few_shot_qa.run(question="What is the capital of the United States?")
-answer = state["answer"].strip().lower()
+    answer = state["answer"].strip().lower()
+    assert "washington" in answer, f"answer: {state['answer']}"
 
-assert "washington" in answer, f"answer: {state['answer']}"
+    print(state.text())
 
-print(state.text())
+
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?",
+        stream=True)
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch([
+        {"question": "What is the capital of the United States?"},
+        {"question": "What is the capital of China?"},
+    ])
+
+    for s in states:
+        print(s["answer"])
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
```
examples/quick_start/openai_example_stream.py (deleted, file mode 100644 → 0; +0, -21)

```diff
-from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
-
-
-@function
-def multi_turn_question(s, question_1, question_2):
-    s += system("You are a helpful assistant.")
-    s += user(question_1)
-    s += assistant(gen("answer_1", max_tokens=256))
-    s += user(question_2)
-    s += assistant(gen("answer_2", max_tokens=256))
-
-
-set_default_backend(OpenAI("gpt-3.5-turbo"))
-
-state = multi_turn_question.run(
-    question_1="What is the capital of the United States?",
-    question_2="List two local attractions.",
-    stream=True
-)
-
-for out in state.text_iter():
-    print(out, end="", flush=True)
```
examples/quick_start/srt_example_chat.py (+60, -17)

```diff
-from sglang import function, system, user, assistant, gen, set_default_backend, Runtime
+"""
+Usage:
+python3 srt_example_chat.py
+"""
+import sglang as sgl
 
 
-@function
+@sgl.function
 def multi_turn_question(s, question_1, question_2):
-    s += system("You are a helpful assistant.")
-    s += user(question_1)
-    s += assistant(gen("answer_1", max_tokens=256))
-    s += user(question_2)
-    s += assistant(gen("answer_2", max_tokens=256))
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
 
 
-runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
-#runtime = Runtime(model_path="mistralai/Mixtral-8x7B-Instruct-v0.1")
-set_default_backend(runtime)
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
 
-state = multi_turn_question.run(
-    question_1="What is the capital of the United States?",
-    question_2="List two local attractions.",
-)
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
 
-for m in state.messages():
-    print(m["role"], ":", m["content"])
+    print("answer_1", state["answer_1"])
 
-runtime.shutdown()
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch([
+        {"question_1": "What is the capital of the United States?",
+         "question_2": "List two local attractions."},
+        {"question_1": "What is the capital of France?",
+         "question_2": "What is the population of this city?"},
+    ])
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
+    sgl.set_default_backend(runtime)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
+
+    runtime.shutdown()
```
examples/quick_start/srt_example_complete.py (+50, -10)

```diff
-from sglang import function, gen, set_default_backend, Runtime
+"""
+Usage:
+python3 srt_example_complete.py
+"""
+import sglang as sgl
 
 
-@function
+@sgl.function
 def few_shot_qa(s, question):
     s += ("""The following are questions with answers.
@@ -13,16 +17,52 @@ Q: What is the capital of Italy?
 A: Rome
 """)
 
     s += "Q: " + question + "\n"
-    s += "A:" + gen("answer", stop="\n", temperature=0)
+    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
 
 
-runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
-set_default_backend(runtime)
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
 
-state = few_shot_qa.run(question="What is the capital of the United States?")
-answer = state["answer"].strip().lower()
+    answer = state["answer"].strip().lower()
+    assert "washington" in answer, f"answer: {state['answer']}"
 
-assert "washington" in answer, f"answer: {state['answer']}"
+    print(state.text())
 
-print(state.text())
 
-runtime.shutdown()
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?",
+        stream=True)
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch([
+        {"question": "What is the capital of the United States?"},
+        {"question": "What is the capital of China?"},
+    ])
+
+    for s in states:
+        print(s["answer"])
+
+
+if __name__ == "__main__":
+    runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
+    sgl.set_default_backend(runtime)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
+
+    runtime.shutdown()
```
examples/quick_start/srt_example_llava.py (+50, -26)

```diff
@@ -10,29 +10,53 @@ def image_qa(s, image_path, question):
     s += sgl.assistant(sgl.gen("answer"))
 
 
-runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.5-7b",
-                      tokenizer_path="llava-hf/llava-1.5-7b-hf")
-sgl.set_default_backend(runtime)
+def single():
+    state = image_qa.run(
+        image_path="images/cat.jpeg",
+        question="What is this?",
+        max_new_tokens=64)
+    print(state["answer"], "\n")
 
-# Single
-state = image_qa.run(
-    image_path="images/cat.jpeg",
-    question="What is this?",
-    max_new_tokens=64)
-print(state["answer"], "\n")
 
-# Batch
-states = image_qa.run_batch(
-    [
-        {"image_path": "images/cat.jpeg", "question": "What is this?"},
-        {"image_path": "images/dog.jpeg", "question": "What is this?"},
-    ],
-    max_new_tokens=64,
-)
-for s in states:
-    print(s["answer"], "\n")
+def stream():
+    state = image_qa.run(
+        image_path="images/cat.jpeg",
+        question="What is this?",
+        max_new_tokens=64,
+        stream=True)
 
-runtime.shutdown()
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = image_qa.run_batch(
+        [
+            {"image_path": "images/cat.jpeg", "question": "What is this?"},
+            {"image_path": "images/dog.jpeg", "question": "What is this?"},
+        ],
+        max_new_tokens=64,
+    )
+    for s in states:
+        print(s["answer"], "\n")
+
+
+if __name__ == "__main__":
+    runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.5-7b",
+                          tokenizer_path="llava-hf/llava-1.5-7b-hf")
+    sgl.set_default_backend(runtime)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
+
+    runtime.shutdown()
```
examples/quick_start/srt_example_stream.py (deleted, file mode 100644 → 0; +0, -26)

```diff
-from sglang import function, system, user, assistant, gen, set_default_backend, Runtime
-
-
-@function
-def multi_turn_question(s, question_1, question_2):
-    s += system("You are a helpful assistant.")
-    s += user(question_1)
-    s += assistant(gen("answer_1", max_tokens=256))
-    s += user(question_2)
-    s += assistant(gen("answer_2", max_tokens=256))
-
-
-runtime = Runtime("meta-llama/Llama-2-7b-chat-hf")
-set_default_backend(runtime)
-
-state = multi_turn_question.run(
-    question_1="What is the capital of the United States?",
-    question_2="List two local attractions.",
-    temperature=0,
-    stream=True,
-)
-
-for out in state.text_iter():
-    print(out, end="", flush=True)
-print()
-
-runtime.shutdown()
```
examples/quick_start/srt_example_regex.py → examples/usage/srt_example_regex.py (file moved, no content changes)
python/sglang/lang/interpreter.py (+4, -2)

```diff
@@ -651,7 +651,7 @@ class ProgramState:
     def sync(self):
         return self.stream_executor.sync()
 
-    def text_iter(self, var_name=None):
+    def text_iter(self, var_name: Optional[str] = None):
         if self.stream_executor.stream:
             prev = 0
             if var_name is None:
@@ -682,7 +682,9 @@ class ProgramState:
             else:
                 yield self.get_var(name)
 
-    async def text_async_iter(self, var_name=None, return_meta_data=False):
+    async def text_async_iter(
+        self, var_name: Optional[str] = None, return_meta_data: bool = False
+    ):
         loop = asyncio.get_running_loop()
         if self.stream_executor.stream:
```
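The second hunk only adds type hints, but `text_async_iter` is the async twin of the `text_iter` used throughout the updated examples. A hedged sketch of consuming it, assuming a streaming-capable backend (OpenAI is used here as in the examples; the method must be awaited inside a running event loop, since it calls `asyncio.get_running_loop()`):

```python
import asyncio

import sglang as sgl


@sgl.function
def qa(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))


async def main():
    # stream=True makes the returned state expose incremental output;
    # text_async_iter("answer") yields chunks of the named variable.
    state = qa.run(question="What is the capital of France?", stream=True)
    async for chunk in state.text_async_iter("answer"):
        print(chunk, end="", flush=True)
    print()


if __name__ == "__main__":
    sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
    asyncio.run(main())
```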
python/sglang/lang/ir.py (+3, -1)

```diff
@@ -74,7 +74,9 @@ class SglSamplingParams:
         )
         return {
             "max_tokens_to_sample": self.max_new_tokens,
-            "stop_sequences": self.stop,
+            "stop_sequences": self.stop
+            if isinstance(self.stop, (list, tuple))
+            else [self.stop],
             "temperature": self.temperature,
             "top_p": self.top_p,
             "top_k": self.top_k,
```
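This change makes `stop_sequences` always a list before it reaches Anthropic's API: the quick-start examples pass `stop="\n"` as a bare string, which previously went through unwrapped. The normalization in isolation, as a small sketch (the helper name `normalize_stop` is illustrative, not sglang's):

```python
def normalize_stop(stop):
    # Mirror the ir.py change: Anthropic's stop_sequences expects a list,
    # but sglang callers often pass a single stop string.
    return stop if isinstance(stop, (list, tuple)) else [stop]


assert normalize_stop("\n") == ["\n"]
assert normalize_stop(["\n", "Human:"]) == ["\n", "Human:"]
```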
python/sglang/srt/models/qwen2.py (+2, -1)

```diff
@@ -8,7 +8,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.managers.router.model_runner import InputMetadata
 from torch import nn
-from transformers import Qwen2Config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -30,6 +29,8 @@ from vllm.model_executor.weight_utils import (
     hf_model_weights_iterator,
 )
 
+Qwen2Config = None
+
 
 class Qwen2MLP(nn.Module):
     def __init__(
```
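Dropping the `transformers` import of `Qwen2Config` and binding the name to `None` at module level keeps the file importable, and its type annotations resolvable, on `transformers` versions that do not ship Qwen2. The commit does this unconditionally; a guarded-import variant of the same idea, shown only as an illustrative sketch:

```python
# A guarded-import variant of what qwen2.py does; the commit itself simply
# binds Qwen2Config = None unconditionally.
try:
    from transformers import Qwen2Config  # needs a transformers release with Qwen2
except ImportError:
    Qwen2Config = None  # annotations like `config: Qwen2Config` still evaluate
```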
python/sglang/srt/server.py (+11, -3)

```diff
@@ -445,18 +445,26 @@ class Runtime:
         pipe_reader, pipe_writer = mp.Pipe(duplex=False)
         proc = mp.Process(target=launch_server, args=(self.server_args, pipe_writer))
         proc.start()
+        pipe_writer.close()
         self.pid = proc.pid
 
-        init_state = pipe_reader.recv()
+        try:
+            init_state = pipe_reader.recv()
+        except EOFError:
+            init_state = ""
 
         if init_state != "init ok":
             self.shutdown()
-            raise RuntimeError("Launch failed")
+            raise RuntimeError("Launch failed. Please see the error messages above.")
 
         self.endpoint = RuntimeEndpoint(self.url)
 
     def shutdown(self):
         if self.pid is not None:
-            parent = psutil.Process(self.pid)
+            try:
+                parent = psutil.Process(self.pid)
+            except psutil.NoSuchProcess:
+                return
             children = parent.children(recursive=True)
             for child in children:
                 child.kill()
```
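This hunk hardens both launch and shutdown. Closing the parent's copy of `pipe_writer` right after the fork is what makes the new `except EOFError` useful: once the child holds the only writer, a child that dies before sending `"init ok"` closes the pipe and the parent's `recv()` raises `EOFError` promptly instead of blocking forever. A self-contained sketch of that mechanism (the `worker` here is hypothetical, not sglang code):

```python
import multiprocessing as mp


def worker(pipe_writer):
    # Hypothetical child that crashes before reporting readiness.
    raise SystemExit(1)
    pipe_writer.send("init ok")  # never reached


if __name__ == "__main__":
    pipe_reader, pipe_writer = mp.Pipe(duplex=False)
    proc = mp.Process(target=worker, args=(pipe_writer,))
    proc.start()
    # Close the parent's handle so the crashed child leaves no open writer;
    # recv() then raises EOFError instead of hanging indefinitely.
    pipe_writer.close()
    try:
        init_state = pipe_reader.recv()
    except EOFError:
        init_state = ""
    if init_state != "init ok":
        print("Launch failed. Please see the error messages above.")
```

The `shutdown()` change is the matching fix on the other end: if the server process is already gone, `psutil.Process(self.pid)` raises `psutil.NoSuchProcess`, and returning early makes shutdown idempotent.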