change / sglang / Commits / 06175286

Commit 06175286 (unverified), authored Jan 30, 2024 by Lianmin Zheng, committed via GitHub on Jan 30, 2024.

Update quick start examples (#120)
Parent: 4ea92f83

Showing 20 changed files with 555 additions and 225 deletions (+555 / -225).
README.md (+15 / -14)
examples/quick_start/anthropic_example_chat.py (+61 / -13)
examples/quick_start/anthropic_example_complete.py (+49 / -8)
examples/quick_start/anthropic_example_stream.py (+0 / -20)
examples/quick_start/gemini_example_chat.py (+67 / -0)
examples/quick_start/gemini_example_complete.py (+49 / -8)
examples/quick_start/gemini_example_multimodal_chat.py (+23 / -13)
examples/quick_start/gemini_example_stream.py (+0 / -20)
examples/quick_start/openai_example_chat.py (+62 / -14)
examples/quick_start/openai_example_complete.py (+49 / -8)
examples/quick_start/openai_example_stream.py (+0 / -21)
examples/quick_start/srt_example_chat.py (+60 / -17)
examples/quick_start/srt_example_complete.py (+50 / -10)
examples/quick_start/srt_example_llava.py (+50 / -26)
examples/quick_start/srt_example_stream.py (+0 / -26)
examples/usage/srt_example_regex.py (+0 / -0)
python/sglang/lang/interpreter.py (+4 / -2)
python/sglang/lang/ir.py (+3 / -1)
python/sglang/srt/models/qwen2.py (+2 / -1)
python/sglang/srt/server.py (+11 / -3)
README.md (view @ 06175286)

````diff
@@ -39,18 +39,20 @@ pip install -e "python[all]"
 - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
 
 ## Quick Start
 The example below shows how to use sglang to answer a multi-turn question.
 
-### Using OpenAI Models
-Set the OpenAI API Key
+### Using Local Models
+First, launch a server with
 ```
-export OPENAI_API_KEY=sk-******
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
 ```
-Then, answer a multi-turn question.
+Then, connect to the server and answer a multi-turn question.
 ```python
-from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
+from sglang import function, system, user, assistant, gen, set_default_backend, RuntimeEndpoint
 
 @function
 def multi_turn_question(s, question_1, question_2):
@@ -60,7 +62,7 @@ def multi_turn_question(s, question_1, question_2):
     s += user(question_2)
     s += assistant(gen("answer_2", max_tokens=256))
 
-set_default_backend(OpenAI("gpt-3.5-turbo"))
+set_default_backend(RuntimeEndpoint("http://localhost:30000"))
 
 state = multi_turn_question.run(
     question_1="What is the capital of the United States?",
@@ -73,16 +75,15 @@ for m in state.messages():
 print(state["answer_1"])
 ```
 
-### Using Local Models
-First, launch a server with
+### Using OpenAI Models
+Set the OpenAI API Key
 ```
-python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+export OPENAI_API_KEY=sk-******
 ```
-Then, connect to the server and answer a multi-turn question.
+Then, answer a multi-turn question.
 ```python
-from sglang import function, system, user, assistant, gen, set_default_backend, RuntimeEndpoint
+from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
 
 @function
 def multi_turn_question(s, question_1, question_2):
@@ -92,7 +93,7 @@ def multi_turn_question(s, question_1, question_2):
     s += user(question_2)
     s += assistant(gen("answer_2", max_tokens=256))
 
-set_default_backend(RuntimeEndpoint("http://localhost:30000"))
+set_default_backend(OpenAI("gpt-3.5-turbo"))
 
 state = multi_turn_question.run(
     question_1="What is the capital of the United States?",
@@ -120,7 +121,7 @@ import sglang as sgl
 `sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
 You can implement your prompt flow in a function decorated by `sgl.function`.
 You can then invoke the function with `run` or `run_batch`.
-The system will manage the state, chat template, and parallelism for you.
+The system will manage the state, chat template, parallelism, and batching for you.
 
 ### Control Flow
 You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
````
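The two quick-start paths now differ only in the backend object. A minimal sketch combining them (assuming a local server already running on port 30000, launched as shown above):

```python
from sglang import (
    function, user, assistant, gen, set_default_backend,
    RuntimeEndpoint, OpenAI,
)

@function
def multi_turn_question(s, question_1, question_2):
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

# Local model: talks to `python -m sglang.launch_server ... --port 30000`
set_default_backend(RuntimeEndpoint("http://localhost:30000"))
# OpenAI instead (requires OPENAI_API_KEY in the environment):
# set_default_backend(OpenAI("gpt-3.5-turbo"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)
print(state["answer_1"])
```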
examples/quick_start/anthropic_example_chat.py (view @ 06175286)

```diff
-from sglang import function, system, user, assistant, gen, set_default_backend, Anthropic
+"""
+Usage:
+export ANTHROPIC_API_KEY=sk-******
+python3 anthropic_example_chat.py
+"""
+import sglang as sgl
 
-@function
+
+@sgl.function
 def multi_turn_question(s, question_1, question_2):
-    s += user(question_1)
-    s += assistant(gen("answer_1", max_tokens=256))
-    s += user(question_2)
-    s += assistant(gen("answer_2", max_tokens=256))
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
 
-set_default_backend(Anthropic("claude-2"))
 
-state = multi_turn_question.run(
-    question_1="What is the capital of the United States?",
-    question_2="List two local attractions.",
-)
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
 
-for m in state.messages():
-    print(m["role"], ":", m["content"])
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("answer_1", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True,
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch([
+        {"question_1": "What is the capital of the United States?",
+         "question_2": "List two local attractions."},
+        {"question_1": "What is the capital of France?",
+         "question_2": "What is the population of this city?"},
+    ])
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.Anthropic("claude-2"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
```
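Every rewritten example follows this scaffold: the `@sgl.function` program at the top, `single`/`stream`/`batch` drivers in the middle, and backend selection under the `__main__` guard. One practical consequence (my reading, not stated in the commit) is that importing a script no longer picks a backend or issues requests, so its pieces can be reused. A hedged sketch; the module import here is hypothetical:

```python
# Hypothetical reuse of the example module above; the import itself is now
# side-effect free because all driver code lives under the __main__ guard.
import sglang as sgl
import anthropic_example_chat as example  # assumes the script is on sys.path

sgl.set_default_backend(sgl.Anthropic("claude-2"))  # choose a backend explicitly
example.single()  # run just one of the three drivers
example.batch()
```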
examples/quick_start/anthropic_example_complete.py (view @ 06175286)

```diff
-from sglang import function, gen, set_default_backend, Anthropic
+"""
+Usage:
+export ANTHROPIC_API_KEY=sk-******
+python3 anthropic_example_complete.py
+"""
+import sglang as sgl
 
-@function
+
+@sgl.function
 def few_shot_qa(s, question):
     s += (
 """
...
@@ -13,14 +19,49 @@ def few_shot_qa(s, question):
 \n\nAssistant: Rome
 """)
     s += "\n\nHuman: " + question + "\n"
-    s += "\n\nAssistant:" + gen("answer", stop="\n", temperature=0)
+    s += "\n\nAssistant:" + sgl.gen("answer", stop="\n", temperature=0)
+
+
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
+    answer = state["answer"].strip().lower()
+    assert "washington" in answer, f"answer: {state['answer']}"
+    print(state.text())
+
+
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?",
+        stream=True,
+    )
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch([
+        {"question": "What is the capital of the United States?"},
+        {"question": "What is the capital of China?"},
+    ])
+    for s in states:
+        print(s["answer"])
 
-set_default_backend(Anthropic("claude-2"))
 
-state = few_shot_qa.run(question="What is the capital of the United States?")
-answer = state["answer"].strip().lower()
-
-assert "washington" in answer, f"answer: {state['answer']}"
-
-print(state.text())
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.Anthropic("claude-2"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
```
examples/quick_start/anthropic_example_stream.py (deleted, 100644 → 0; view @ 4ea92f83)

```diff
-from sglang import function, system, user, assistant, gen, set_default_backend, Anthropic
-
-@function
-def multi_turn_question(s, question_1, question_2):
-    s += user(question_1)
-    s += assistant(gen("answer_1", max_tokens=256))
-    s += user(question_2)
-    s += assistant(gen("answer_2", max_tokens=256))
-
-set_default_backend(Anthropic("claude-2"))
-
-state = multi_turn_question.run(
-    question_1="What is the capital of the United States?",
-    question_2="List two local attractions.",
-    stream=True,
-)
-
-for out in state.text_iter():
-    print(out, end="", flush=True)
```
examples/quick_start/gemini_example_chat.py (new file, 0 → 100644; view @ 06175286)

```diff
+"""
+Usage:
+export GCP_PROJECT_ID=******
+python3 gemini_example_chat.py
+"""
+import sglang as sgl
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("answer_1", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True,
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch([
+        {"question_1": "What is the capital of the United States?",
+         "question_2": "List two local attractions."},
+        {"question_1": "What is the capital of France?",
+         "question_2": "What is the population of this city?"},
+    ])
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.VertexAI("gemini-pro"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
```
examples/quick_start/gemini_example_complete.py (view @ 06175286)

```diff
-from sglang import function, gen, set_default_backend, VertexAI
+"""
+Usage:
+export GCP_PROJECT_ID=******
+python3 gemini_example_complete.py
+"""
+import sglang as sgl
 
-@function
+
+@sgl.function
 def few_shot_qa(s, question):
     s += ("""The following are questions with answers.
...
@@ -13,14 +19,49 @@ Q: What is the capital of Italy?
 A: Rome
 """)
     s += "Q: " + question + "\n"
-    s += "A:" + gen("answer", stop="\n", temperature=0)
+    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
+
+
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
+    answer = state["answer"].strip().lower()
+    assert "washington" in answer, f"answer: {state['answer']}"
+    print(state.text())
+
+
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?",
+        stream=True,
+    )
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch([
+        {"question": "What is the capital of the United States?"},
+        {"question": "What is the capital of China?"},
+    ])
+    for s in states:
+        print(s["answer"])
 
-set_default_backend(VertexAI("gemini-pro"))
 
-state = few_shot_qa.run(question="What is the capital of the United States?")
-answer = state["answer"].strip().lower()
-
-assert "washington" in answer, f"answer: {state['answer']}"
-
-print(state.text())
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.VertexAI("gemini-pro"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
```
examples/quick_start/gemini_example_multimodal_chat.py (view @ 06175286)

```diff
-from sglang import function, user, assistant, gen, image, set_default_backend, VertexAI
+"""
+Usage:
+export GCP_PROJECT_ID=******
+python3 gemini_example_multimodal_chat.py
+"""
+import sglang as sgl
 
-@function
+
+@sgl.function
 def image_qa(s, image_file1, image_file2, question):
-    s += user(image(image_file1) + image(image_file2) + question)
-    s += assistant(gen("answer_1", max_tokens=256))
+    s += sgl.user(sgl.image(image_file1) + sgl.image(image_file2) + question)
+    s += sgl.assistant(sgl.gen("answer", max_tokens=256))
 
-set_default_backend(VertexAI("gemini-pro-vision"))
 
-state = image_qa.run(
-    image_file1="./images/cat.jpeg",
-    image_file2="./images/dog.jpeg",
-    question="Describe difference of the 2 images in one sentence.",
-    stream=True)
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision"))
 
-for out in state.text_iter():
-    print(out, end="", flush=True)
+    state = image_qa.run(
+        image_file1="./images/cat.jpeg",
+        image_file2="./images/dog.jpeg",
+        question="Describe difference of the two images in one sentence.",
+        stream=True,
+    )
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+    print(state["answer"])
```
examples/quick_start/gemini_example_stream.py (deleted, 100644 → 0; view @ 4ea92f83)

```diff
-from sglang import function, user, assistant, gen, set_default_backend, VertexAI
-
-@function
-def multi_turn_question(s, question_1, question_2):
-    s += user(question_1)
-    s += assistant(gen("answer_1", max_tokens=256))
-    s += user(question_2)
-    s += assistant(gen("answer_2", max_tokens=256))
-
-set_default_backend(VertexAI("gemini-pro"))
-
-state = multi_turn_question.run(
-    question_1="What is the capital of the United States?",
-    question_2="List two local attractions.",
-    stream=True,
-)
-
-for out in state.text_iter():
-    print(out, end="", flush=True)
```
examples/quick_start/openai_example_chat.py (view @ 06175286)

```diff
-from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
+"""
+Usage:
+export OPENAI_API_KEY=sk-******
+python3 openai_example_chat.py
+"""
+import sglang as sgl
 
-@function
+
+@sgl.function
 def multi_turn_question(s, question_1, question_2):
-    s += system("You are a helpful assistant.")
-    s += user(question_1)
-    s += assistant(gen("answer_1", max_tokens=256))
-    s += user(question_2)
-    s += assistant(gen("answer_2", max_tokens=256))
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
 
-set_default_backend(OpenAI("gpt-3.5-turbo"))
 
-state = multi_turn_question.run(
-    question_1="What is the capital of the United States?",
-    question_2="List two local attractions.",
-)
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
 
-for m in state.messages():
-    print(m["role"], ":", m["content"])
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("answer_1", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True,
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch([
+        {"question_1": "What is the capital of the United States?",
+         "question_2": "List two local attractions."},
+        {"question_1": "What is the capital of France?",
+         "question_2": "What is the population of this city?"},
+    ])
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
```
examples/quick_start/openai_example_complete.py (view @ 06175286)

```diff
-from sglang import function, gen, set_default_backend, OpenAI
+"""
+Usage:
+export OPENAI_API_KEY=sk-******
+python3 openai_example_complete.py
+"""
+import sglang as sgl
 
-@function
+
+@sgl.function
 def few_shot_qa(s, question):
     s += ("""The following are questions with answers.
...
@@ -13,14 +19,49 @@ Q: What is the capital of Italy?
 A: Rome
 """)
     s += "Q: " + question + "\n"
-    s += "A:" + gen("answer", stop="\n", temperature=0)
+    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
+
+
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
+    answer = state["answer"].strip().lower()
+    assert "washington" in answer, f"answer: {state['answer']}"
+    print(state.text())
+
+
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?",
+        stream=True,
+    )
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch([
+        {"question": "What is the capital of the United States?"},
+        {"question": "What is the capital of China?"},
+    ])
+    for s in states:
+        print(s["answer"])
 
-set_default_backend(OpenAI("gpt-3.5-turbo-instruct"))
 
-state = few_shot_qa.run(question="What is the capital of the United States?")
-answer = state["answer"].strip().lower()
-
-assert "washington" in answer, f"answer: {state['answer']}"
-
-print(state.text())
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
```
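Note the model swap in this file: the chat examples keep `gpt-3.5-turbo`, while this raw-completion program moves to `gpt-3.5-turbo-instruct`, presumably because a plain string-plus-`gen` program maps onto the completions API rather than the chat API. The pairing, as a sketch rather than anything stated in the diff:

```python
import sglang as sgl

# Chat-style programs (sgl.system/sgl.user/sgl.assistant) pair with a chat model:
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))

# Raw-completion programs (string concatenation + sgl.gen) pair with a
# completions-capable model:
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
```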
examples/quick_start/openai_example_stream.py (deleted, 100644 → 0; view @ 4ea92f83)

```diff
-from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
-
-@function
-def multi_turn_question(s, question_1, question_2):
-    s += system("You are a helpful assistant.")
-    s += user(question_1)
-    s += assistant(gen("answer_1", max_tokens=256))
-    s += user(question_2)
-    s += assistant(gen("answer_2", max_tokens=256))
-
-set_default_backend(OpenAI("gpt-3.5-turbo"))
-
-state = multi_turn_question.run(
-    question_1="What is the capital of the United States?",
-    question_2="List two local attractions.",
-    stream=True,
-)
-
-for out in state.text_iter():
-    print(out, end="", flush=True)
```
examples/quick_start/srt_example_chat.py (view @ 06175286)

```diff
-from sglang import function, system, user, assistant, gen, set_default_backend, Runtime
+"""
+Usage:
+python3 srt_example_chat.py
+"""
+import sglang as sgl
 
-@function
+
+@sgl.function
 def multi_turn_question(s, question_1, question_2):
-    s += system("You are a helpful assistant.")
-    s += user(question_1)
-    s += assistant(gen("answer_1", max_tokens=256))
-    s += user(question_2)
-    s += assistant(gen("answer_2", max_tokens=256))
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
 
-runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
-#runtime = Runtime(model_path="mistralai/Mixtral-8x7B-Instruct-v0.1")
-set_default_backend(runtime)
 
-state = multi_turn_question.run(
-    question_1="What is the capital of the United States?",
-    question_2="List two local attractions.",
-)
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
 
-for m in state.messages():
-    print(m["role"], ":", m["content"])
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
 
-runtime.shutdown()
+    print("answer_1", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True,
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch([
+        {"question_1": "What is the capital of the United States?",
+         "question_2": "List two local attractions."},
+        {"question_1": "What is the capital of France?",
+         "question_2": "What is the population of this city?"},
+    ])
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
+    sgl.set_default_backend(runtime)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
+
+    runtime.shutdown()
```
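Unlike the API-backed examples, the srt scripts wrap a local model in `sgl.Runtime`, which starts a server process, so the trailing `runtime.shutdown()` is load-bearing. A minimal sketch of that lifecycle using `try`/`finally` (the `hello` program here is illustrative; the examples simply call `shutdown()` as the last line of `__main__`):

```python
import sglang as sgl


@sgl.function
def hello(s):
    # Hypothetical tiny program, just to exercise the runtime.
    s += "Say hello in five words." + sgl.gen("greeting", max_tokens=16)


if __name__ == "__main__":
    runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
    sgl.set_default_backend(runtime)
    try:
        print(hello.run()["greeting"])
    finally:
        runtime.shutdown()  # always reap the server process, even on errors
```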
examples/quick_start/srt_example_complete.py (view @ 06175286)

```diff
-from sglang import function, gen, set_default_backend, Runtime
+"""
+Usage:
+python3 srt_example_complete.py
+"""
+import sglang as sgl
 
-@function
+
+@sgl.function
 def few_shot_qa(s, question):
     s += ("""The following are questions with answers.
...
@@ -13,16 +17,52 @@ Q: What is the capital of Italy?
 A: Rome
 """)
     s += "Q: " + question + "\n"
-    s += "A:" + gen("answer", stop="\n", temperature=0)
+    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
 
-runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
-set_default_backend(runtime)
 
-state = few_shot_qa.run(question="What is the capital of the United States?")
-answer = state["answer"].strip().lower()
-assert "washington" in answer, f"answer: {state['answer']}"
-print(state.text())
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
+    answer = state["answer"].strip().lower()
+    assert "washington" in answer, f"answer: {state['answer']}"
+    print(state.text())
 
-runtime.shutdown()
+
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?",
+        stream=True,
+    )
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch([
+        {"question": "What is the capital of the United States?"},
+        {"question": "What is the capital of China?"},
+    ])
+    for s in states:
+        print(s["answer"])
+
+
+if __name__ == "__main__":
+    runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
+    sgl.set_default_backend(runtime)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
+
+    runtime.shutdown()
```
examples/quick_start/srt_example_llava.py (view @ 06175286)

```diff
@@ -10,29 +10,53 @@ def image_qa(s, image_path, question):
     s += sgl.assistant(sgl.gen("answer"))
 
-runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.5-7b",
-                      tokenizer_path="llava-hf/llava-1.5-7b-hf")
-sgl.set_default_backend(runtime)
 
-# Single
-state = image_qa.run(image_path="images/cat.jpeg", question="What is this?", max_new_tokens=64)
-print(state["answer"], "\n")
+def single():
+    state = image_qa.run(
+        image_path="images/cat.jpeg",
+        question="What is this?",
+        max_new_tokens=64)
+    print(state["answer"], "\n")
 
-# Batch
-states = image_qa.run_batch(
-    [
-        {"image_path": "images/cat.jpeg", "question": "What is this?"},
-        {"image_path": "images/dog.jpeg", "question": "What is this?"},
-    ],
-    max_new_tokens=64,
-)
-for s in states:
-    print(s["answer"], "\n")
+
+def stream():
+    state = image_qa.run(
+        image_path="images/cat.jpeg",
+        question="What is this?",
+        max_new_tokens=64,
+        stream=True)
 
-runtime.shutdown()
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = image_qa.run_batch(
+        [
+            {"image_path": "images/cat.jpeg", "question": "What is this?"},
+            {"image_path": "images/dog.jpeg", "question": "What is this?"},
+        ],
+        max_new_tokens=64,
+    )
+    for s in states:
+        print(s["answer"], "\n")
+
+
+if __name__ == "__main__":
+    runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.5-7b",
+                          tokenizer_path="llava-hf/llava-1.5-7b-hf")
+    sgl.set_default_backend(runtime)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
+
+    runtime.shutdown()
```
examples/quick_start/srt_example_stream.py (deleted, 100644 → 0; view @ 4ea92f83)

```diff
-from sglang import function, system, user, assistant, gen, set_default_backend, Runtime
-
-@function
-def multi_turn_question(s, question_1, question_2):
-    s += system("You are a helpful assistant.")
-    s += user(question_1)
-    s += assistant(gen("answer_1", max_tokens=256))
-    s += user(question_2)
-    s += assistant(gen("answer_2", max_tokens=256))
-
-runtime = Runtime("meta-llama/Llama-2-7b-chat-hf")
-set_default_backend(runtime)
-
-state = multi_turn_question.run(
-    question_1="What is the capital of the United States?",
-    question_2="List two local attractions.",
-    temperature=0,
-    stream=True,
-)
-
-for out in state.text_iter():
-    print(out, end="", flush=True)
-print()
-
-runtime.shutdown()
```
examples/quick_start/srt_example_regex.py → examples/usage/srt_example_regex.py (view @ 06175286)

File moved without content changes.
python/sglang/lang/interpreter.py (view @ 06175286)

```diff
@@ -651,7 +651,7 @@ class ProgramState:
     def sync(self):
         return self.stream_executor.sync()
 
-    def text_iter(self, var_name=None):
+    def text_iter(self, var_name: Optional[str] = None):
         if self.stream_executor.stream:
             prev = 0
             if var_name is None:
@@ -682,7 +682,9 @@ class ProgramState:
             else:
                 yield self.get_var(name)
 
-    async def text_async_iter(self, var_name=None, return_meta_data=False):
+    async def text_async_iter(
+        self, var_name: Optional[str] = None, return_meta_data: bool = False
+    ):
         loop = asyncio.get_running_loop()
         if self.stream_executor.stream:
```
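The new annotations document the two streaming modes the examples rely on: with no argument, `text_iter` yields the full generated text; with a variable name, it yields only the tokens bound to that `gen()` call. A usage sketch, assuming the `multi_turn_question` program from the examples above:

```python
state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    stream=True,
)

# Stream everything:
# for out in state.text_iter(): ...

# Stream only the tokens bound to "answer_2":
for out in state.text_iter("answer_2"):
    print(out, end="", flush=True)
print()
```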
python/sglang/lang/ir.py (view @ 06175286)

```diff
@@ -74,7 +74,9 @@ class SglSamplingParams:
         )
         return {
             "max_tokens_to_sample": self.max_new_tokens,
-            "stop_sequences": self.stop,
+            "stop_sequences": self.stop
+            if isinstance(self.stop, (list, tuple))
+            else [self.stop],
             "temperature": self.temperature,
             "top_p": self.top_p,
             "top_k": self.top_k,
```
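This dict is the Anthropic parameter mapping (`max_tokens_to_sample` is Anthropic's completion argument), and that API expects `stop_sequences` to be a list, while `sgl.gen(..., stop="\n")` passes a bare string. The normalization is easy to check in isolation:

```python
def normalize_stop(stop):
    """Mirror of the new stop_sequences logic, for illustration only."""
    return stop if isinstance(stop, (list, tuple)) else [stop]

assert normalize_stop("\n") == ["\n"]                        # bare string is wrapped
assert normalize_stop(["\n", "Human:"]) == ["\n", "Human:"]  # lists pass through
```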
python/sglang/srt/models/qwen2.py (view @ 06175286)

```diff
@@ -8,7 +8,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.managers.router.model_runner import InputMetadata
 from torch import nn
-from transformers import Qwen2Config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -30,6 +29,8 @@ from vllm.model_executor.weight_utils import (
     hf_model_weights_iterator,
 )
 
+Qwen2Config = None
+
 
 class Qwen2MLP(nn.Module):
     def __init__(
```
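My reading, not stated in the commit: `Qwen2Config` appears here only in annotations, and importing it fails on transformers releases that predate Qwen2, so the module-level stub keeps the file importable. The equivalent defensive pattern, had the import been kept, would be:

```python
# A hedged sketch; the commit simply assigns Qwen2Config = None instead.
try:
    from transformers import Qwen2Config  # present only in newer transformers
except ImportError:
    Qwen2Config = None  # annotations still resolve; nothing instantiates it here
```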
python/sglang/srt/server.py (view @ 06175286)

```diff
@@ -445,18 +445,26 @@ class Runtime:
         pipe_reader, pipe_writer = mp.Pipe(duplex=False)
         proc = mp.Process(target=launch_server, args=(self.server_args, pipe_writer))
         proc.start()
         pipe_writer.close()
         self.pid = proc.pid
 
-        init_state = pipe_reader.recv()
+        try:
+            init_state = pipe_reader.recv()
+        except EOFError:
+            init_state = ""
 
         if init_state != "init ok":
             self.shutdown()
-            raise RuntimeError("Launch failed")
+            raise RuntimeError("Launch failed. Please see the error messages above.")
 
         self.endpoint = RuntimeEndpoint(self.url)
 
     def shutdown(self):
         if self.pid is not None:
-            parent = psutil.Process(self.pid)
+            try:
+                parent = psutil.Process(self.pid)
+            except psutil.NoSuchProcess:
+                return
             children = parent.children(recursive=True)
             for child in children:
                 child.kill()
```
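Both hunks treat "the other process is already gone" as a normal outcome: a server that dies before sending `init ok` now surfaces as a clean launch error instead of an unhandled `EOFError`, and shutting down an already-dead server no longer raises `psutil.NoSuchProcess`. A self-contained sketch of the pipe half of the pattern (the `child` function is illustrative, not from the diff):

```python
import multiprocessing as mp


def child(pipe):
    # Simulate a server that dies before reporting "init ok".
    pipe.close()


if __name__ == "__main__":
    reader, writer = mp.Pipe(duplex=False)
    proc = mp.Process(target=child, args=(writer,))
    proc.start()
    writer.close()  # parent must close its copy, or recv() would block forever
    try:
        init_state = reader.recv()
    except EOFError:  # the child exited without sending anything
        init_state = ""
    assert init_state != "init ok"  # caller can now raise a clean launch error
    proc.join()
```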