Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
c4707f1b
"src/turbomind/utils/memory_utils.h" did not exist on "720fc533da804ac3f46ee938864403e51fcd9fa7"
Unverified
Commit
c4707f1b
authored
Jan 16, 2024
by
Lianmin Zheng
Committed by
GitHub
Jan 16, 2024
Browse files
Improve docs (#17)
parent
ffe4aaee
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
24 additions
and
12 deletions
+24
-12
README.md
README.md
+18
-3
examples/usage/readme_examples.py
examples/usage/readme_examples.py
+4
-1
python/sglang/lang/interpreter.py
python/sglang/lang/interpreter.py
+1
-5
python/sglang/srt/layers/context_flashattention_nopad.py
python/sglang/srt/layers/context_flashattention_nopad.py
+0
-1
python/sglang/srt/layers/extend_attention.py
python/sglang/srt/layers/extend_attention.py
+0
-1
python/sglang/srt/managers/router/model_rpc.py
python/sglang/srt/managers/router/model_rpc.py
+1
-1
No files found.
README.md
View file @
c4707f1b
...
...
@@ -115,13 +115,14 @@ You can then invoke the function with `run` or `run_batch`.
The system will manage the state, chat template, and parallelism for you.
### Control Flow
You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
```
python
@
sgl
.
function
def
control_flow
(
s
,
question
):
s
+=
"To answer this question: "
+
question
+
", "
s
+=
"I need to use a "
+
sgl
.
gen
(
"tool"
,
choices
=
[
"calculator"
,
"web browser"
])
+
". "
# You can use if or nested function calls
if
s
[
"tool"
]
==
"calculator"
:
s
+=
"The math expression is"
+
sgl
.
gen
(
"expression"
)
elif
s
[
"tool"
]
==
"web browser"
:
...
...
@@ -129,6 +130,9 @@ def control_flow(s, question):
```
### Parallelism
Use
`fork`
to launch parallel prompts.
Because
`sgl.gen`
is non-blocking, the for loop below issues two generation calls in parallel.
```
python
@
sgl
.
function
def
tip_suggestion
(
s
):
...
...
@@ -137,7 +141,7 @@ def tip_suggestion(s):
"1. Balanced Diet. 2. Regular Exercise.
\n\n
"
)
forks
=
s
.
fork
(
2
)
# Launch parallel prompts
forks
=
s
.
fork
(
2
)
for
i
,
f
in
enumerate
(
forks
):
f
+=
f
"Now, expand tip
{
i
+
1
}
into a paragraph:
\n
"
f
+=
sgl
.
gen
(
f
"detailed_tip"
,
max_tokens
=
256
,
stop
=
"
\n\n
"
)
...
...
@@ -148,6 +152,8 @@ def tip_suggestion(s):
```
### Multi Modality
Use
`sgl.image`
to pass an image as input.
```
python
@
sgl
.
function
def
image_qa
(
s
,
image_file
,
question
):
...
...
@@ -156,6 +162,8 @@ def image_qa(s, image_file, question):
```
### Constrained Decoding
Use
`regex=`
to specify a regular expression as a decoding constraint.
```
python
@
sgl
.
function
def
regular_expression_gen
(
s
):
...
...
@@ -168,6 +176,8 @@ def regular_expression_gen(s):
```
### Batching
Use
`run_batch`
to run a batch of requests with continuous batching.
```
python
@
sgl
.
function
def
text_qa
(
s
,
question
):
...
...
@@ -180,10 +190,13 @@ states = text_qa.run_batch(
{
"question"
:
"What is the capital of France?"
},
{
"question"
:
"What is the capital of Japan?"
},
],
progress_bar
=
True
)
```
### Streaming
Add
`stream=True`
to enable streaming.
```
python
@
sgl
.
function
def
text_qa
(
s
,
question
):
...
...
@@ -192,7 +205,9 @@ def text_qa(s, question):
states
=
text_qa
.
run
(
question
=
"What is the capital of France?"
,
temperature
=
0.1
)
temperature
=
0.1
,
stream
=
True
)
for
out
in
state
.
text_iter
():
print
(
out
,
end
=
""
,
flush
=
True
)
...
...
examples/usage/readme_examples.py
View file @
c4707f1b
...
...
@@ -53,6 +53,7 @@ def driver_batching():
{
"question"
:
"What is the capital of France?"
},
{
"question"
:
"What is the capital of Japan?"
},
],
progress_bar
=
True
)
for
s
in
states
:
...
...
@@ -63,7 +64,9 @@ def driver_batching():
def
driver_stream
():
state
=
text_qa
.
run
(
question
=
"What is the capital of France?"
,
temperature
=
0.1
)
temperature
=
0.1
,
stream
=
True
)
for
out
in
state
.
text_iter
():
print
(
out
,
end
=
""
,
flush
=
True
)
...
...
python/sglang/lang/interpreter.py
View file @
c4707f1b
...
...
@@ -632,11 +632,7 @@ class ProgramState:
self
.
stream_executor
.
end
()
def
__repr__
(
self
)
->
str
:
msgs
=
self
.
messages
()
ret
=
""
for
msg
in
msgs
:
ret
+=
msg
[
"role"
]
+
":
\n
"
+
msg
[
"content"
]
+
"
\n
"
return
ret
return
f
"ProgramState(
{
self
.
text
()
}
)"
class
ProgramStateGroup
:
...
...
python/sglang/srt/layers/context_flashattention_nopad.py
View file @
c4707f1b
...
...
@@ -5,7 +5,6 @@ import triton
import
triton.language
as
tl
from
sglang.srt.utils
import
wrap_kernel_launcher
CUDA_CAPABILITY
=
torch
.
cuda
.
get_device_capability
()
...
...
python/sglang/srt/layers/extend_attention.py
View file @
c4707f1b
...
...
@@ -4,7 +4,6 @@ import triton.language as tl
from
sglang.srt.layers.context_flashattention_nopad
import
context_attention_fwd
from
sglang.srt.utils
import
wrap_kernel_launcher
CUDA_CAPABILITY
=
torch
.
cuda
.
get_device_capability
()
...
...
python/sglang/srt/managers/router/model_rpc.py
View file @
c4707f1b
...
...
@@ -2,10 +2,10 @@ import asyncio
import
logging
import
multiprocessing
import
time
import
warnings
from
concurrent.futures
import
ThreadPoolExecutor
from
enum
import
Enum
,
auto
from
typing
import
Dict
,
List
,
Optional
,
Tuple
,
Union
import
warnings
import
numpy
as
np
import
rpyc
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment