Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
79ece2c5
Unverified
Commit
79ece2c5
authored
Aug 30, 2024
by
Lianmin Zheng
Committed by
GitHub
Aug 30, 2024
Browse files
Report median instead of mean in bench_latency.py (#1269)
parent
55f5976b
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
8 additions
and
14 deletions
+8
-14
README.md
README.md
+1
-1
python/sglang/bench_latency.py
python/sglang/bench_latency.py
+7
-5
python/sglang/srt/hf_transformers_utils.py
python/sglang/srt/hf_transformers_utils.py
+0
-8
No files found.
README.md
View file @
79ece2c5
...
@@ -233,7 +233,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
...
@@ -233,7 +233,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
### Supported Models
### Supported Models
**Generative Models**
**Generative Models**
-
Exaone 3.0
-
Llama / Llama 2 / Llama 3 / Llama 3.1
-
Llama / Llama 2 / Llama 3 / Llama 3.1
-
Mistral / Mixtral / Mistral NeMo
-
Mistral / Mixtral / Mistral NeMo
-
Gemma / Gemma 2
-
Gemma / Gemma 2
...
@@ -253,6 +252,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
...
@@ -253,6 +252,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
-
Grok
-
Grok
-
ChatGLM
-
ChatGLM
-
InternLM 2
-
InternLM 2
-
Exaone 3
**Embedding Models**
**Embedding Models**
...
...
python/sglang/bench_latency.py
View file @
79ece2c5
...
@@ -292,6 +292,7 @@ def latency_test_run_once(
...
@@ -292,6 +292,7 @@ def latency_test_run_once(
measurement_results
[
"prefill_throughput"
]
=
throughput
measurement_results
[
"prefill_throughput"
]
=
throughput
# Decode
# Decode
decode_latencies
=
[]
for
i
in
range
(
output_len
):
for
i
in
range
(
output_len
):
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
synchronize
()
tic
=
time
.
time
()
tic
=
time
.
time
()
...
@@ -300,17 +301,18 @@ def latency_test_run_once(
...
@@ -300,17 +301,18 @@ def latency_test_run_once(
latency
=
time
.
time
()
-
tic
latency
=
time
.
time
()
-
tic
tot_latency
+=
latency
tot_latency
+=
latency
throughput
=
batch_size
/
latency
throughput
=
batch_size
/
latency
decode_latencies
.
append
(
latency
)
if
i
<
5
:
if
i
<
5
:
rank_print
(
rank_print
(
f
"Decode. latency:
{
latency
:
6.5
f
}
s, throughput:
{
throughput
:
9.2
f
}
token/s"
f
"Decode. latency:
{
latency
:
6.5
f
}
s, throughput:
{
throughput
:
9.2
f
}
token/s"
)
)
avg
_decode_latency
=
(
tot_latency
-
prefill_latency
)
/
output_len
med
_decode_latency
=
np
.
median
(
decode_latencies
)
avg
_decode_throughput
=
batch_size
/
avg
_decode_latency
med
_decode_throughput
=
batch_size
/
med
_decode_latency
rank_print
(
rank_print
(
f
"Decode.
avg
latency:
{
avg
_decode_latency
:
6.5
f
}
s,
avg
throughput:
{
avg
_decode_throughput
:
9.2
f
}
token/s"
f
"Decode.
median
latency:
{
med
_decode_latency
:
6.5
f
}
s,
median
throughput:
{
med
_decode_throughput
:
9.2
f
}
token/s"
)
)
measurement_results
[
"
avg
_decode_latency"
]
=
avg
_decode_latency
measurement_results
[
"
median
_decode_latency"
]
=
med
_decode_latency
measurement_results
[
"
avg
_decode_throughput"
]
=
avg
_decode_throughput
measurement_results
[
"
median
_decode_throughput"
]
=
med
_decode_throughput
throughput
=
(
input_len
+
output_len
)
*
batch_size
/
tot_latency
throughput
=
(
input_len
+
output_len
)
*
batch_size
/
tot_latency
rank_print
(
rank_print
(
...
...
python/sglang/srt/hf_transformers_utils.py
View file @
79ece2c5
...
@@ -50,8 +50,6 @@ for name, cls in _CONFIG_REGISTRY.items():
...
@@ -50,8 +50,6 @@ for name, cls in _CONFIG_REGISTRY.items():
with
contextlib
.
suppress
(
ValueError
):
with
contextlib
.
suppress
(
ValueError
):
AutoConfig
.
register
(
name
,
cls
)
AutoConfig
.
register
(
name
,
cls
)
from
sglang.srt.utils
import
is_multimodal_model
def
download_from_hf
(
model_path
:
str
):
def
download_from_hf
(
model_path
:
str
):
if
os
.
path
.
exists
(
model_path
):
if
os
.
path
.
exists
(
model_path
):
...
@@ -60,12 +58,6 @@ def download_from_hf(model_path: str):
...
@@ -60,12 +58,6 @@ def download_from_hf(model_path: str):
return
snapshot_download
(
model_path
,
allow_patterns
=
[
"*.json"
,
"*.bin"
,
"*.model"
])
return
snapshot_download
(
model_path
,
allow_patterns
=
[
"*.json"
,
"*.bin"
,
"*.model"
])
def
get_config_json
(
model_path
:
str
):
with
open
(
os
.
path
.
join
(
model_path
,
"configs.json"
))
as
f
:
config
=
json
.
load
(
f
)
return
config
def
get_config
(
def
get_config
(
model
:
str
,
model
:
str
,
trust_remote_code
:
bool
,
trust_remote_code
:
bool
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment