Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
79ece2c5
"tests/vscode:/vscode.git/clone" did not exist on "a7e9f85e21dde12f2f2489702ea82db40ebb31d2"
Unverified
Commit
79ece2c5
authored
Aug 30, 2024
by
Lianmin Zheng
Committed by
GitHub
Aug 30, 2024
Browse files
Report median instead of mean in bench_latency.py (#1269)
parent
55f5976b
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
8 additions
and
14 deletions
+8
-14
README.md
README.md
+1
-1
python/sglang/bench_latency.py
python/sglang/bench_latency.py
+7
-5
python/sglang/srt/hf_transformers_utils.py
python/sglang/srt/hf_transformers_utils.py
+0
-8
No files found.
README.md
View file @
79ece2c5
...
@@ -233,7 +233,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
...
@@ -233,7 +233,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
### Supported Models
### Supported Models
**Generative Models**
**Generative Models**
-
Exaone 3.0
-
Llama / Llama 2 / Llama 3 / Llama 3.1
-
Llama / Llama 2 / Llama 3 / Llama 3.1
-
Mistral / Mixtral / Mistral NeMo
-
Mistral / Mixtral / Mistral NeMo
-
Gemma / Gemma 2
-
Gemma / Gemma 2
...
@@ -253,6 +252,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
...
@@ -253,6 +252,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
-
Grok
-
Grok
-
ChatGLM
-
ChatGLM
-
InternLM 2
-
InternLM 2
-
Exaone 3
**Embedding Models**
**Embedding Models**
...
...
python/sglang/bench_latency.py
View file @
79ece2c5
...
@@ -292,6 +292,7 @@ def latency_test_run_once(
...
@@ -292,6 +292,7 @@ def latency_test_run_once(
measurement_results
[
"prefill_throughput"
]
=
throughput
measurement_results
[
"prefill_throughput"
]
=
throughput
# Decode
# Decode
decode_latencies
=
[]
for
i
in
range
(
output_len
):
for
i
in
range
(
output_len
):
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
synchronize
()
tic
=
time
.
time
()
tic
=
time
.
time
()
...
@@ -300,17 +301,18 @@ def latency_test_run_once(
...
@@ -300,17 +301,18 @@ def latency_test_run_once(
latency
=
time
.
time
()
-
tic
latency
=
time
.
time
()
-
tic
tot_latency
+=
latency
tot_latency
+=
latency
throughput
=
batch_size
/
latency
throughput
=
batch_size
/
latency
decode_latencies
.
append
(
latency
)
if
i
<
5
:
if
i
<
5
:
rank_print
(
rank_print
(
f
"Decode. latency:
{
latency
:
6.5
f
}
s, throughput:
{
throughput
:
9.2
f
}
token/s"
f
"Decode. latency:
{
latency
:
6.5
f
}
s, throughput:
{
throughput
:
9.2
f
}
token/s"
)
)
avg
_decode_latency
=
(
tot_latency
-
prefill_latency
)
/
output_len
med
_decode_latency
=
np
.
median
(
decode_latencies
)
avg
_decode_throughput
=
batch_size
/
avg
_decode_latency
med
_decode_throughput
=
batch_size
/
med
_decode_latency
rank_print
(
rank_print
(
f
"Decode.
avg
latency:
{
avg
_decode_latency
:
6.5
f
}
s,
avg
throughput:
{
avg
_decode_throughput
:
9.2
f
}
token/s"
f
"Decode.
median
latency:
{
med
_decode_latency
:
6.5
f
}
s,
median
throughput:
{
med
_decode_throughput
:
9.2
f
}
token/s"
)
)
measurement_results
[
"
avg
_decode_latency"
]
=
avg
_decode_latency
measurement_results
[
"
median
_decode_latency"
]
=
med
_decode_latency
measurement_results
[
"
avg
_decode_throughput"
]
=
avg
_decode_throughput
measurement_results
[
"
median
_decode_throughput"
]
=
med
_decode_throughput
throughput
=
(
input_len
+
output_len
)
*
batch_size
/
tot_latency
throughput
=
(
input_len
+
output_len
)
*
batch_size
/
tot_latency
rank_print
(
rank_print
(
...
...
python/sglang/srt/hf_transformers_utils.py
View file @
79ece2c5
...
@@ -50,8 +50,6 @@ for name, cls in _CONFIG_REGISTRY.items():
...
@@ -50,8 +50,6 @@ for name, cls in _CONFIG_REGISTRY.items():
with
contextlib
.
suppress
(
ValueError
):
with
contextlib
.
suppress
(
ValueError
):
AutoConfig
.
register
(
name
,
cls
)
AutoConfig
.
register
(
name
,
cls
)
from
sglang.srt.utils
import
is_multimodal_model
def
download_from_hf
(
model_path
:
str
):
def
download_from_hf
(
model_path
:
str
):
if
os
.
path
.
exists
(
model_path
):
if
os
.
path
.
exists
(
model_path
):
...
@@ -60,12 +58,6 @@ def download_from_hf(model_path: str):
...
@@ -60,12 +58,6 @@ def download_from_hf(model_path: str):
return
snapshot_download
(
model_path
,
allow_patterns
=
[
"*.json"
,
"*.bin"
,
"*.model"
])
return
snapshot_download
(
model_path
,
allow_patterns
=
[
"*.json"
,
"*.bin"
,
"*.model"
])
def
get_config_json
(
model_path
:
str
):
with
open
(
os
.
path
.
join
(
model_path
,
"configs.json"
))
as
f
:
config
=
json
.
load
(
f
)
return
config
def
get_config
(
def
get_config
(
model
:
str
,
model
:
str
,
trust_remote_code
:
bool
,
trust_remote_code
:
bool
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment