sglang / Commits / 6e2da515

Commit 6e2da515 (unverified)
Authored May 11, 2025 by Lifu Huang; committed via GitHub on May 11, 2025.
Replace time.time() with time.perf_counter() for benchmarking. (#6178)
Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>

Parent: e9a47f4c
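For context on the change (an editorial note, not part of the commit message): time.time() reads the wall clock, which has platform-dependent resolution and can jump when the system clock is adjusted (for example by NTP), while time.perf_counter() is a monotonic, high-resolution clock intended specifically for measuring elapsed intervals. A minimal sketch of the before/after timing pattern, using a hypothetical stand-in workload:

    import time

    def slow_workload():
        # Hypothetical stand-in for a benchmarked call such as run_batch().
        return sum(range(1_000_000))

    # Old pattern: wall-clock time; can jump if the system clock is adjusted.
    tic = time.time()
    slow_workload()
    latency_wall = time.time() - tic

    # New pattern: monotonic, high-resolution clock meant for interval timing.
    tic = time.perf_counter()
    slow_workload()
    latency = time.perf_counter() - tic
    print(f"Latency: {latency:.3f}")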
Changes: 61

Showing 20 changed files with 60 additions and 60 deletions (+60 / −60)
benchmark/tip_suggestion/bench_sglang.py (+2 −2)
benchmark/tree_of_thought_deep/bench_other.py (+2 −2)
benchmark/tree_of_thought_deep/bench_sglang.py (+2 −2)
benchmark/tree_of_thought_v0/bench_other.py (+2 −2)
benchmark/tree_of_thought_v0/bench_sglang.py (+2 −2)
python/sglang/test/few_shot_gsm8k.py (+2 −2)
python/sglang/test/few_shot_gsm8k_engine.py (+2 −2)
python/sglang/test/run_eval.py (+2 −2)
python/sglang/test/test_programs.py (+4 −4)
python/sglang/test/test_utils.py (+7 −7)
sgl-router/py_test/test_launch_server.py (+4 −4)
test/srt/experiment_runner.py (+5 −5)
test/srt/models/test_encoder_embedding_models.py (+4 −4)
test/srt/test_gptqmodel_dynamic.py (+4 −4)
test/srt/test_release_memory_occupation.py (+4 −4)
test/srt/test_torch_compile.py (+2 −2)
test/srt/test_torch_compile_moe.py (+2 −2)
test/srt/test_torchao.py (+2 −2)
test/srt/test_update_weights_from_distributed.py (+4 −4)
test/srt/test_update_weights_from_tensor.py (+2 −2)
benchmark/tip_suggestion/bench_sglang.py

@@ -65,11 +65,11 @@ def main(args):
     sgl.set_default_backend(select_sglang_backend(args))

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = suggest_tips.run_batch(
         arguments, temperature=0, num_threads=args.parallel, progress_bar=True
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     print(f"Latency: {latency:.3f}")
benchmark/tree_of_thought_deep/bench_other.py

@@ -138,7 +138,7 @@ def main(args):
     # Run requests
     states = [None] * len(questions)
-    tic = time.time()
+    tic = time.perf_counter()

     if args.backend != "lmql":

         def get_one_answer(i):

@@ -177,7 +177,7 @@ def main(args):
             tasks = [get_one_answer_async(k) for k in bt]
             loop.run_until_complete(asyncio.gather(*tasks))

-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     answers_text = []
     for s in states:
benchmark/tree_of_thought_deep/bench_sglang.py

@@ -119,7 +119,7 @@ def main(args):
     backend = select_sglang_backend(args)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = tree_search.run_batch(
         arguments,
         temperature=0,

@@ -127,7 +127,7 @@ def main(args):
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     answers_text = []
     for s in states:
         answers_text.append([x for xs in s.ret_value for x in xs])
benchmark/tree_of_thought_v0/bench_other.py

@@ -121,7 +121,7 @@ def main(args):
     def get_one_answer(i):
         states[i] = tree_search(**arguments[i], call_generate=call_generate)

-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(questions))):
             get_one_answer(i)

@@ -134,7 +134,7 @@ def main(args):
             )
         )

-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     answers_text = []
     for s in states:
benchmark/tree_of_thought_v0/bench_sglang.py

@@ -107,7 +107,7 @@ def main(args):
     backend = select_sglang_backend(args)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = tree_search.run_batch(
         arguments,
         temperature=0,

@@ -115,7 +115,7 @@ def main(args):
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     answers_text = []
     for s in states:
         answers_text.append([x for xs in s["answer"] for x in xs])
python/sglang/test/few_shot_gsm8k.py

@@ -90,7 +90,7 @@ def run_eval(args):
     #####################################

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = few_shot_gsm8k.run_batch(
         arguments,
         temperature=args.temperature if hasattr(args, "temperature") else 0,

@@ -99,7 +99,7 @@ def run_eval(args):
         return_logprob=getattr(args, "return_logprob", None),
         logprob_start_len=getattr(args, "logprob_start_len", None),
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     preds = []
     for i in range(len(states)):
python/sglang/test/few_shot_gsm8k_engine.py

@@ -89,7 +89,7 @@ def run_eval(args):
     }

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()

     loop = asyncio.get_event_loop()

@@ -98,7 +98,7 @@ def run_eval(args):
     )

     # End requests
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Shutdown the engine
     engine.shutdown()
python/sglang/test/run_eval.py

@@ -71,9 +71,9 @@ def run_eval(args):
     )

     # Run eval
-    tic = time.time()
+    tic = time.perf_counter()
     result = eval_obj(sampler)
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Dump reports
     metrics = result.metrics | {"score": result.score}
python/sglang/test/test_programs.py

@@ -503,7 +503,7 @@ def test_hellaswag_select():
     #####################################

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,

@@ -514,13 +514,13 @@ def test_hellaswag_select():
     preds = []
     for i, ret in enumerate(rets):
         preds.append(choices[i].index(ret["answer"]))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     accuracy = np.mean(np.array(preds) == np.array(labels))

     # Test generator style of run_batch
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,

@@ -531,7 +531,7 @@ def test_hellaswag_select():
     preds_gen = []
     for i, ret in enumerate(rets):
         preds_gen.append(choices[i].index(ret["answer"]))
-    latency_gen = time.time() - tic
+    latency_gen = time.perf_counter() - tic

     # Compute accuracy
     accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
python/sglang/test/test_utils.py

@@ -449,9 +449,9 @@ def popen_launch_server(
     else:
         process = subprocess.Popen(command, stdout=None, stderr=None, env=env)

-    start_time = time.time()
+    start_time = time.perf_counter()
     with requests.Session() as session:
-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
             try:
                 headers = {
                     "Content-Type": "application/json; charset=utf-8",

@@ -584,7 +584,7 @@ class TestFile:

 def run_unittest_files(files: List[TestFile], timeout_per_file: float):
-    tic = time.time()
+    tic = time.perf_counter()

     success = True
     for i, file in enumerate(files):

@@ -599,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
             f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
             flush=True,
         )
-        tic = time.time()
+        tic = time.perf_counter()

         process = subprocess.Popen(["python3", filename], stdout=None, stderr=None, env=os.environ)
         process.wait()
-        elapsed = time.time() - tic
+        elapsed = time.perf_counter() - tic

         print(
             f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",

@@ -631,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
             break

     if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
     else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)

     return 0 if success else -1
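Beyond pure benchmarking, several of the touched helpers use the timer to drive a timeout loop (popen_launch_server and run_unittest_files above, and terminate_and_wait below); a monotonic clock keeps those waits from being shortened or stretched by wall-clock adjustments. A minimal sketch of that polling pattern, using a hypothetical wait_until helper not found in the repository:

    import os
    import time

    def wait_until(predicate, timeout: float, interval: float = 0.5) -> bool:
        # Hypothetical helper: poll predicate() until it returns True or
        # `timeout` seconds have elapsed on the monotonic clock.
        start = time.perf_counter()
        while time.perf_counter() - start < timeout:
            if predicate():
                return True
            time.sleep(interval)
        return False

    # Example: wait up to 10 seconds for a flag file to appear.
    ready = wait_until(lambda: os.path.exists("/tmp/server_ready"), timeout=10)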
sgl-router/py_test/test_launch_server.py

@@ -92,9 +92,9 @@ def popen_launch_router(
     process = subprocess.Popen(command, stdout=None, stderr=None)

-    start_time = time.time()
+    start_time = time.perf_counter()
     with requests.Session() as session:
-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
             try:
                 response = session.get(f"{base_url}/health")
                 if response.status_code == 200:

@@ -155,11 +155,11 @@ def terminate_and_wait(process, timeout=300):
         return

     process.terminate()
-    start_time = time.time()
+    start_time = time.perf_counter()

     while process.poll() is None:
         print(f"Terminating process {process.pid}")
-        if time.time() - start_time > timeout:
+        if time.perf_counter() - start_time > timeout:
             raise TimeoutError(
                 f"Process {process.pid} failed to terminate within {timeout}s"
             )
test/srt/experiment_runner.py

@@ -184,9 +184,9 @@ class ExperimentRunner:
         self.logger = logging.getLogger(__name__)

     def wait_for_server(self, port: int, timeout: int = 300) -> bool:
-        start_time = time.time()
+        start_time = time.perf_counter()
-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
             try:
                 response = requests.get(f"http://localhost:{port}/health")
                 if response.status_code == 200:

@@ -197,7 +197,7 @@ class ExperimentRunner:
         return False

     def run_task(self, config: TaskConfig) -> TaskResult:
-        start_time = time.time()
+        start_time = time.perf_counter()
         client_output = []

         try:

@@ -247,7 +247,7 @@ class ExperimentRunner:
                 name=config.name,
                 success=True,
                 output=formatted_output,
-                runtime=time.time() - start_time,
+                runtime=time.perf_counter() - start_time,
                 timestamp=datetime.now().isoformat(),
             )

@@ -256,7 +256,7 @@ class ExperimentRunner:
                 name=config.name,
                 success=False,
                 output=str(e),
-                runtime=time.time() - start_time,
+                runtime=time.perf_counter() - start_time,
                 timestamp=datetime.now().isoformat(),
             )
test/srt/models/test_encoder_embedding_models.py

@@ -79,9 +79,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
             # warm up
             hf_outputs = hf_runner.forward(truncated_prompts)

-            st_start_time = time.time()
+            st_start_time = time.perf_counter()
             hf_outputs = hf_runner.forward(truncated_prompts)
-            st_end_time = time.time()
+            st_end_time = time.perf_counter()

         with SRTRunner(
             model_path,

@@ -95,9 +95,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
             # warm up
             srt_outputs = srt_runner.forward(truncated_prompts)

-            sgl_start_time = time.time()
+            sgl_start_time = time.perf_counter()
             srt_outputs = srt_runner.forward(truncated_prompts)
-            sgl_end_time = time.time()
+            sgl_end_time = time.perf_counter()

         transformer_time = st_end_time - st_start_time
         sgl_time = sgl_end_time - sgl_start_time
test/srt/test_gptqmodel_dynamic.py

@@ -130,9 +130,9 @@ class TestGPTQModelDynamic(CustomTestCase):
     def test_throughput(self):
         max_tokens = 256

-        tic = time.time()
+        tic = time.perf_counter()
         result = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()

         print(f"result = `{result}`")

@@ -185,9 +185,9 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
     def test_throughput(self):
         max_tokens = 256

-        tic = time.time()
+        tic = time.perf_counter()
         result = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()

         print(f"result = `{result}`")
test/srt/test_release_memory_occupation.py

@@ -42,10 +42,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
         )

         print("release_memory_occupation start")
-        t = time.time()
+        t = time.perf_counter()
         engine.release_memory_occupation()
         if _DEBUG_EXTRA:
-            print("release_memory_occupation", time.time() - t)
+            print("release_memory_occupation", time.perf_counter() - t)

         if _DEBUG_EXTRA:
             time.sleep(5)

@@ -60,10 +60,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
             time.sleep(5)

         print("resume_memory_occupation start")
-        t = time.time()
+        t = time.perf_counter()
         engine.resume_memory_occupation()
         if _DEBUG_EXTRA:
-            print("resume_memory_occupation", time.time() - t)
+            print("resume_memory_occupation", time.perf_counter() - t)

         self.assertEqual(
             _try_allocate_big_tensor(),
test/srt/test_torch_compile.py

@@ -62,9 +62,9 @@ class TestTorchCompile(CustomTestCase):
         res = self.run_decode(16)

         max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
         res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
         print(f"{res=}")
         throughput = max_tokens / (tok - tic)
         print(f"Throughput: {throughput} tokens/s")
test/srt/test_torch_compile_moe.py

@@ -62,9 +62,9 @@ class TestTorchCompileMoe(CustomTestCase):
         res = self.run_decode(16)

         max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
         res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
         print(f"{res=}")
         throughput = max_tokens / (tok - tic)
         self.assertGreaterEqual(throughput, 285)
test/srt/test_torchao.py

@@ -61,9 +61,9 @@ class TestTorchAO(CustomTestCase):
         max_tokens = 256

-        tic = time.time()
+        tic = time.perf_counter()
         res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
         print(res["text"])
         throughput = max_tokens / (tok - tic)
         print(f"Throughput: {throughput} tokens/s")
test/srt/test_update_weights_from_distributed.py

@@ -164,7 +164,7 @@ def init_process_hf(
     )
     dist.barrier(group=group, device_ids=[rank])
     torch.cuda.synchronize()
-    time_begin_broadcast = time.time()
+    time_begin_broadcast = time.perf_counter()

     # The last parameter is lm_head.weight, which is tied
     # with embed_tokens.weight. Actually, we only need

@@ -182,7 +182,7 @@ def init_process_hf(
             group=group,
         )
     torch.cuda.synchronize()
-    time_end_broadcast = time.time()
+    time_end_broadcast = time.perf_counter()

     # Measure the latency of broadcasting/weights update.
     broadcast_time = time_end_broadcast - time_begin_broadcast

@@ -282,7 +282,7 @@ def init_process_sgl(
     )
     torch.cuda.synchronize()
-    time_begin_update = time.time()
+    time_begin_update = time.perf_counter()

     # The last parameter is lm_head.weight, which is tied
     # with embed_tokens.weight. Actually, we only need

@@ -312,7 +312,7 @@ def init_process_sgl(
         },
     )
     torch.cuda.synchronize()
-    time_end_update = time.time()
+    time_end_update = time.perf_counter()

     # Measure the latency of broadcast/weights update.
     update_time = time_end_update - time_begin_update
test/srt/test_update_weights_from_tensor.py

@@ -21,9 +21,9 @@ def test_update_weights_from_tensor(tp_size):
     memory_before = torch.cuda.memory_allocated()
     new_tensor = torch.full((16384, 2048), 1.5, device="cuda")

-    time_start = time.time()
+    time_start = time.perf_counter()
     engine.update_weights_from_tensor([(x, new_tensor) for x in param_names])
-    print(f"Time delta: {time.time() - time_start:.03f}")
+    print(f"Time delta: {time.perf_counter() - time_start:.03f}")

     for param_name in param_names[:3]:
         _check_param(engine, param_name, [1.5] * 5)