Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
b548801d
Unverified
Commit
b548801d
authored
Oct 30, 2024
by
Lianmin Zheng
Committed by
GitHub
Oct 30, 2024
Browse files
Update docs (#1839)
parent
539df95d
Changes
11
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
165 additions
and
198 deletions
+165
-198
docs/deploy.py
docs/deploy.py
+1
-1
docs/deploy_docs.sh
docs/deploy_docs.sh
+0
-0
docs/openai_api.ipynb
docs/openai_api.ipynb
+92
-75
docs/send_request.ipynb
docs/send_request.ipynb
+51
-80
python/sglang/srt/mem_cache/flush_cache.py
python/sglang/srt/mem_cache/flush_cache.py
+1
-1
python/sglang/srt/model_executor/model_runner.py
python/sglang/srt/model_executor/model_runner.py
+1
-1
python/sglang/srt/server.py
python/sglang/srt/server.py
+2
-2
python/sglang/utils.py
python/sglang/utils.py
+7
-37
scripts/ci_install_dependency.sh
scripts/ci_install_dependency.sh
+4
-0
scripts/killall_sglang.sh
scripts/killall_sglang.sh
+4
-0
scripts/version_branch_to_tag.sh
scripts/version_branch_to_tag.sh
+2
-1
No files found.
docs/deploy.py
View file @
b548801d
#
!/usr/bin/python3
#
Deploy the documents
import
os
import
os
from
datetime
import
datetime
from
datetime
import
datetime
...
...
docs/deploy_docs.sh
deleted
100644 → 0
View file @
539df95d
docs/openai_api.ipynb
View file @
b548801d
This diff is collapsed.
Click to expand it.
docs/send_request.ipynb
View file @
b548801d
This diff is collapsed.
Click to expand it.
python/sglang/srt/mem_cache/flush_cache.py
View file @
b548801d
...
@@ -29,5 +29,5 @@ if __name__ == "__main__":
...
@@ -29,5 +29,5 @@ if __name__ == "__main__":
parser
.
add_argument
(
"--url"
,
type
=
str
,
default
=
"http://localhost:30000"
)
parser
.
add_argument
(
"--url"
,
type
=
str
,
default
=
"http://localhost:30000"
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
response
=
requests
.
ge
t
(
args
.
url
+
"/flush_cache"
)
response
=
requests
.
pos
t
(
args
.
url
+
"/flush_cache"
)
assert
response
.
status_code
==
200
assert
response
.
status_code
==
200
python/sglang/srt/model_executor/model_runner.py
View file @
b548801d
...
@@ -124,7 +124,7 @@ class ModelRunner:
...
@@ -124,7 +124,7 @@ class ModelRunner:
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
)
)
server_args
.
chunked_prefill_size
=
None
server_args
.
chunked_prefill_size
=
None
se
rver_args
.
mem_fraction_static
*=
0.95
se
lf
.
mem_fraction_static
*=
0.95
# TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
# TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
if
self
.
model_config
.
hf_config
.
architectures
==
[
if
self
.
model_config
.
hf_config
.
architectures
==
[
"Qwen2VLForConditionalGeneration"
"Qwen2VLForConditionalGeneration"
...
...
python/sglang/srt/server.py
View file @
b548801d
...
@@ -139,7 +139,7 @@ async def get_server_args():
...
@@ -139,7 +139,7 @@ async def get_server_args():
return
dataclasses
.
asdict
(
tokenizer_manager
.
server_args
)
return
dataclasses
.
asdict
(
tokenizer_manager
.
server_args
)
@
app
.
ge
t
(
"/flush_cache"
)
@
app
.
pos
t
(
"/flush_cache"
)
async
def
flush_cache
():
async
def
flush_cache
():
"""Flush the radix cache."""
"""Flush the radix cache."""
tokenizer_manager
.
flush_cache
()
tokenizer_manager
.
flush_cache
()
...
@@ -180,7 +180,7 @@ async def get_memory_pool_size():
...
@@ -180,7 +180,7 @@ async def get_memory_pool_size():
return
ret
return
ret
except
Exception
as
e
:
except
Exception
as
e
:
return
JSONResponse
(
return
OR
JSONResponse
(
{
"error"
:
{
"message"
:
str
(
e
)}},
status_code
=
HTTPStatus
.
BAD_REQUEST
{
"error"
:
{
"message"
:
str
(
e
)}},
status_code
=
HTTPStatus
.
BAD_REQUEST
)
)
...
...
python/sglang/utils.py
View file @
b548801d
...
@@ -19,7 +19,6 @@ from typing import Optional, Union
...
@@ -19,7 +19,6 @@ from typing import Optional, Union
import
numpy
as
np
import
numpy
as
np
import
requests
import
requests
import
torch
from
IPython.display
import
HTML
,
display
from
IPython.display
import
HTML
,
display
from
tqdm
import
tqdm
from
tqdm
import
tqdm
...
@@ -332,13 +331,12 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
...
@@ -332,13 +331,12 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
headers
=
{
"Authorization"
:
"Bearer None"
},
headers
=
{
"Authorization"
:
"Bearer None"
},
)
)
if
response
.
status_code
==
200
:
if
response
.
status_code
==
200
:
time
.
sleep
(
5
)
print_highlight
(
print_highlight
(
"""
"""
\n
Server and notebook outputs are combined for clarity.
NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
Typically, the server runs in a separate terminal.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
Server output is gray; notebook output is highlighted.
"""
"""
)
)
break
break
...
@@ -350,36 +348,8 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
...
@@ -350,36 +348,8 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
def
terminate_process
(
process
):
def
terminate_process
(
process
):
"""Safely terminate a process and clean up GPU memory.
from
sglang.srt.utils
import
kill_child_process
kill_child_process
(
process
.
pid
,
include_self
=
True
)
Args:
process: subprocess.Popen object to terminate
"""
try
:
process
.
terminate
()
try
:
process
.
wait
(
timeout
=
5
)
except
subprocess
.
TimeoutExpired
:
if
os
.
name
!=
"nt"
:
try
:
pgid
=
os
.
getpgid
(
process
.
pid
)
os
.
killpg
(
pgid
,
signal
.
SIGTERM
)
time
.
sleep
(
1
)
if
process
.
poll
()
is
None
:
os
.
killpg
(
pgid
,
signal
.
SIGKILL
)
except
ProcessLookupError
:
pass
else
:
process
.
kill
()
process
.
wait
()
except
Exception
as
e
:
print
(
f
"Warning:
{
e
}
"
)
finally
:
gc
.
collect
()
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
empty_cache
()
torch
.
cuda
.
ipc_collect
()
time
.
sleep
(
2
)
def
print_highlight
(
html_content
:
str
):
def
print_highlight
(
html_content
:
str
):
...
...
scripts/ci_install_dependency.sh
View file @
b548801d
"""
Install the dependency in CI.
"""
pip
install
--upgrade
pip
pip
install
--upgrade
pip
pip
install
-e
"python[all]"
pip
install
-e
"python[all]"
pip
install
transformers
==
4.45.2
pip
install
transformers
==
4.45.2
...
...
scripts/killall_sglang.sh
View file @
b548801d
"""
Kill all SGLang processes and free the GPU memory.
"""
kill
-9
$(
ps aux |
grep
'multiprocessing.spawn'
|
grep
-v
'grep'
|
awk
'{print $2}'
)
kill
-9
$(
ps aux |
grep
'multiprocessing.spawn'
|
grep
-v
'grep'
|
awk
'{print $2}'
)
kill
-9
$(
ps aux |
grep
'sglang.launch_server'
|
grep
-v
'grep'
|
awk
'{print $2}'
)
kill
-9
$(
ps aux |
grep
'sglang.launch_server'
|
grep
-v
'grep'
|
awk
'{print $2}'
)
scripts/version_branch_to_tag.sh
View file @
b548801d
#!/bin/bash
#!/bin/bash
# This script tags all remote branches starting with 'v' with the same name as the branch,
# This script is used for release.
# It tags all remote branches starting with 'v' with the same name as the branch,
# deletes the corresponding branches from the remote, and pushes the tags to the remote repository.
# deletes the corresponding branches from the remote, and pushes the tags to the remote repository.
git fetch origin
--prune
git fetch origin
--prune
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment