Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
46b7ea7c
Commit
46b7ea7c
authored
Jan 15, 2024
by
Lianmin Zheng
Browse files
Improve Readme (#10)
parent
70359bf3
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
167 additions
and
7 deletions
+167
-7
README.md
README.md
+85
-3
assets/llama_7b.jpg
assets/llama_7b.jpg
+0
-0
assets/mixtral_8x7b.jpg
assets/mixtral_8x7b.jpg
+0
-0
examples/usage/readme_examples.py
examples/usage/readme_examples.py
+79
-0
examples/usage/streaming.py
examples/usage/streaming.py
+3
-4
No files found.
README.md
View file @
46b7ea7c
...
...
@@ -94,25 +94,99 @@ You can find more examples at [examples/quick_start](examples/quick_start).
## Frontend: Structured Generation Language (SGLang)
To begin with, import sglang.
```
python
import
sglang
as
sgl
```
`sglang`
provides some simple primitives such as
`gen`
,
`select`
,
`fork`
.
You can implement your prompt flow in a function decorated by
`sgl.function`
.
You can then invoke the function with
`run`
or
`run_batch`
.
The system will manage the state, chat template, and parallelism for you.
### Control Flow
```
python
@sgl.function
def control_flow(s, question):
    """Choose a tool with `select`-style generation, then branch on the choice."""
    s += "To answer this question: " + question + ", "
    s += (
        "I need to use a "
        + sgl.gen("tool", choices=["calculator", "web browser"])
        + ". "
    )

    # You can use if or nested function calls
    if s["tool"] == "calculator":
        s += "The math expression is" + sgl.gen("expression")
    elif s["tool"] == "web browser":
        s += "The website url is" + sgl.gen("url")
```
### Parallelism
```
python
@sgl.function
def tip_suggestion(s):
    """Expand two health tips in parallel forks, then merge and summarize."""
    s += (
        "Here are two tips for staying healthy: "
        "1. Balanced Diet. 2. Regular Exercise.\n\n"
    )

    # Launch parallel prompts
    forks = s.fork(2)
    for idx, branch in enumerate(forks):
        branch += f"Now, expand tip {idx + 1} into a paragraph:\n"
        branch += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")

    s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
    s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
    s += "In summary" + sgl.gen("summary")
```
### Multi-Modality
```
python
@sgl.function
def image_qa(s, image_file, question):
    """Ask a question about an image via the chat template helpers."""
    s += sgl.user(sgl.image(image_file) + question)
    # Fixed: the original snippet was missing the closing parenthesis on this
    # call, and a stale duplicate assistant turn (gen("answer_1")) from the
    # diff residue has been dropped so the example runs as written.
    s += sgl.assistant(sgl.gen("answer", max_tokens=256))
```
### Constrained Decoding
```
python
@sgl.function
def regular_expression_gen(s):
    """Constrain generation to an IPv4-address shape with a regex.

    Fixed: the snippet used bare `@function` and `gen`, which are undefined
    under the `import sglang as sgl` shown earlier in the README; prefix them
    with `sgl.` for consistency with every other example.
    """
    s += "Q: What is the IP address of the Google DNS servers?\n"
    s += "A: " + sgl.gen(
        "answer",
        temperature=0,
        regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
    )
```
### Batching
```
python
@sgl.function
def text_qa(s, question):
    """Single-turn Q&A: prompt with the question, generate one answer line."""
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n")


# Run the same program over a batch of argument dicts.
states = text_qa.run_batch(
    [
        {"question": "What is the capital of the United Kingdom?"},
        {"question": "What is the capital of France?"},
        {"question": "What is the capital of Japan?"},
    ],
)
```
### Streaming
```
python
@sgl.function
def text_qa(s, question):
    """Single-turn Q&A: prompt with the question, generate one answer line."""
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n")


# Fixed: this was assigned to `states`, but the streaming loop that follows
# iterates `state.text_iter()` — the snippet raised NameError as written.
state = text_qa.run(question="What is the capital of France?", temperature=0.1)
### Other Backends
# Print tokens as they arrive, without buffering or extra newlines.
for chunk in state.text_iter():
    print(chunk, end="", flush=True)
```
## Backend: SGLang Runtime (SRT)
The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
...
...
@@ -151,6 +225,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
## Benchmark And Performance
-
Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1

-
Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8

Learn more
[
here
](
).
## Roadmap
-
[ ] Function call
-
[ ] Quantization
...
...
assets/llama_7b.jpg
0 → 100644
View file @
46b7ea7c
231 KB
assets/mixtral_8x7b.jpg
0 → 100644
View file @
46b7ea7c
157 KB
examples/usage/readme_examples.py
0 → 100644
View file @
46b7ea7c
import
sglang
as
sgl
@sgl.function
def tool_use(s, question):
    """Pick a tool via constrained choice, then prompt for that tool's input."""
    s += "To answer this question: " + question + ", "
    s += (
        "I need to use a "
        + sgl.gen("tool", choices=["calculator", "web browser"])
        + ". "
    )

    if s["tool"] == "calculator":
        s += "The math expression is" + sgl.gen("expression")
    elif s["tool"] == "web browser":
        s += "The website url is" + sgl.gen("url")
@sgl.function
def tip_suggestion(s):
    """Expand two health tips in parallel forks, then merge and summarize."""
    s += (
        "Here are two tips for staying healthy: "
        "1. Balanced Diet. 2. Regular Exercise.\n\n"
    )

    forks = s.fork(2)
    for idx, branch in enumerate(forks):
        branch += f"Now, expand tip {idx + 1} into a paragraph:\n"
        branch += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")

    s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
    s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
    s += "In summary" + sgl.gen("summary")
@sgl.function
def text_qa(s, question):
    """Single-turn Q&A: prompt with the question, generate one answer line."""
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n")
def driver_tool_use():
    """Run the tool_use program once and print the full generated text."""
    result = tool_use.run(question="What is the capital of the United States?")
    print(result.text())
    print("\n")
def driver_tip_suggestion():
    """Run the forked tip_suggestion program and print the merged text."""
    result = tip_suggestion.run()
    print(result.text())
    print("\n")
def driver_batching():
    """Run text_qa over a batch of questions and print each completed state."""
    questions = [
        {"question": "What is the capital of the United Kingdom?"},
        {"question": "What is the capital of France?"},
        {"question": "What is the capital of Japan?"},
    ]
    states = text_qa.run_batch(questions)

    for finished in states:
        print(finished.text())
    print("\n")
def driver_stream():
    """Run text_qa and print the answer incrementally as tokens arrive."""
    result = text_qa.run(
        question="What is the capital of France?",
        temperature=0.1,
    )
    for chunk in result.text_iter():
        print(chunk, end="", flush=True)
    print("\n")
if __name__ == "__main__":
    # All examples share one default backend; swap for a local RuntimeEndpoint
    # to run against a self-hosted server instead.
    sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))

    driver_tool_use()
    driver_tip_suggestion()
    driver_batching()
    driver_stream()
examples/
quick_start/more_stream_methods
.py
→
examples/
usage/streaming
.py
View file @
46b7ea7c
...
...
@@ -12,7 +12,6 @@ def multi_turn_question(s, question_1, question_2):
sgl
.
set_default_backend
(
sgl
.
OpenAI
(
"gpt-3.5-turbo"
))
#sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
def
stream_a_variable
():
...
...
@@ -24,7 +23,7 @@ def stream_a_variable():
for
out
in
state
.
text_iter
(
var_name
=
"answer_2"
):
print
(
out
,
end
=
""
,
flush
=
True
)
print
()
print
(
"
\n
"
)
async
def
async_stream
():
...
...
@@ -36,9 +35,9 @@ async def async_stream():
async
for
out
in
state
.
text_async_iter
(
var_name
=
"answer_2"
):
print
(
out
,
end
=
""
,
flush
=
True
)
print
()
print
(
"
\n
"
)
if
__name__
==
"__main__"
:
#
stream_a_variable()
stream_a_variable
()
asyncio
.
run
(
async_stream
())
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment