sglang commit 6e2da515 (unverified)

Authored May 11, 2025 by Lifu Huang; committed via GitHub on May 11, 2025.

Replace time.time() with time.perf_counter() for benchmarking. (#6178)

Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>

Parent: e9a47f4c
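For context on the change: time.time() reads the wall clock, which has coarse resolution on some platforms and can jump when the system clock is adjusted, whereas time.perf_counter() is a monotonic, high-resolution timer intended for measuring elapsed intervals. Below is a minimal sketch of the timing pattern the benchmarks now follow; the measure_seconds helper and the sum() workload are illustrative only and are not part of this commit.

import time


def measure_seconds(fn, *args, **kwargs):
    # Hypothetical helper (not from this commit): time one call using the
    # monotonic, high-resolution clock recommended for benchmarking.
    tic = time.perf_counter()
    result = fn(*args, **kwargs)
    elapsed = time.perf_counter() - tic  # elapsed seconds as a float
    return result, elapsed


if __name__ == "__main__":
    # Stand-in workload; any callable can be measured the same way.
    _, latency = measure_seconds(sum, range(1_000_000))
    print(f"Latency: {latency:.3f} s ({latency * 1000:.1f} ms)")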
Changes: showing 20 of the 61 changed files in this commit, with 54 additions and 54 deletions on this page.
benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py (+6 -6)
benchmark/benchmark_batch/benchmark_batch.py (+4 -4)
benchmark/benchmark_batch/benchmark_tokenizer.py (+4 -4)
benchmark/generative_agents/bench_other.py (+2 -2)
benchmark/generative_agents/bench_sglang.py (+2 -2)
benchmark/gsm8k/bench_other.py (+3 -3)
benchmark/gsm8k/bench_sglang.py (+2 -2)
benchmark/hellaswag/bench_other.py (+3 -3)
benchmark/hellaswag/bench_sglang.py (+2 -2)
benchmark/hicache/bench_multiturn.py (+2 -2)
benchmark/json_decode_regex/bench_other.py (+2 -2)
benchmark/json_decode_regex/bench_sglang.py (+2 -2)
benchmark/json_jump_forward/bench_other.py (+4 -4)
benchmark/json_jump_forward/bench_sglang.py (+4 -4)
benchmark/json_schema/bench_sglang.py (+2 -2)
benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py (+2 -2)
benchmark/kernels/quantization/tuning_block_wise_kernel.py (+2 -2)
benchmark/line_retrieval/bench_sglang.py (+2 -2)
benchmark/llava_bench/bench_sglang.py (+2 -2)
benchmark/llm_judge/bench_other.py (+2 -2)
benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py
@@ -64,11 +64,11 @@ def test_batch_by_batch(all_prompts, gen_len):
     tot_time = 0
     for i in range(len(all_prompts)):
-        tic = time.time()
+        tic = time.perf_counter()
         text_qa.run_batch(
             list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))),
         )
-        tot_time += time.time() - tic
+        tot_time += time.perf_counter() - tic
     return tot_time
@@ -78,13 +78,13 @@ def test_batch_by_batch_with_hint(all_prompts, gen_len):
     tot_time = 0
     for i in range(len(all_prompts)):
-        tic = time.time()
+        tic = time.perf_counter()
         # Send a hint to cache the prefix
         text_qa.run_batch(list(zip(all_prompts[i][:1], [gen_len])))
         # Send the batch
         text_qa.run_batch(list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))))
-        tot_time += time.time() - tic
+        tot_time += time.perf_counter() - tic
     return tot_time
@@ -94,11 +94,11 @@ def test_send_all(all_prompts, gen_len):
     all_prompts = [x for prompt_list in all_prompts for x in prompt_list]
-    tic = time.time()
+    tic = time.perf_counter()
     text_qa.run_batch(
         list(zip(all_prompts, [gen_len] * len(all_prompts))),
     )
-    tot_time = time.time() - tic
+    tot_time = time.perf_counter() - tic
     return tot_time
benchmark/benchmark_batch/benchmark_batch.py
@@ -81,7 +81,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
     }
     data = {"text": prompts, "sampling_params": sampling_params}
-    start_time = time.time()
+    start_time = time.perf_counter()
     try:
         response = requests.post(
             endpoint.base_url + "/generate", json=data, timeout=3600
@@ -90,7 +90,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
             error = response.json()
             raise RuntimeError(f"Request {request_id} failed: {error}")
         result = response.json()
-        elapsed_time = (time.time() - start_time) * 1000  # Convert to ms
+        elapsed_time = (time.perf_counter() - start_time) * 1000  # Convert to ms
         avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
         return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
     except Exception as e:
@@ -104,7 +104,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
     num_requests = len(batched_prompts)
     # Record start time for total latency
-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()
     for i, batch_prompts in enumerate(batched_prompts):
         request_id = i + 1
@@ -119,7 +119,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
         results.append(result)
     # Calculate total latency
-    total_latency = (time.time() - benchmark_start_time) * 1000  # Convert to ms
+    total_latency = (time.perf_counter() - benchmark_start_time) * 1000  # Convert to ms
     return results, total_latency
benchmark/benchmark_batch/benchmark_tokenizer.py
@@ -44,10 +44,10 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
     for run in range(NUM_RUNS):
         batch_prompts = prompts[:batch_size]  # Use same prompts for fair comparison
-        start_time = time.time()
+        start_time = time.perf_counter()
         for prompt in batch_prompts:
             tokens = tokenizer.encode(prompt)
-        sequential_time = (time.time() - start_time) * 1000
+        sequential_time = (time.perf_counter() - start_time) * 1000
         sequential_times.append(sequential_time)
     # Batch tokenization using tokenizer()
@@ -55,9 +55,9 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
     for run in range(NUM_RUNS):
         batch_prompts = prompts[:batch_size]  # Use same prompts for fair comparison
-        start_time = time.time()
+        start_time = time.perf_counter()
         tokens = tokenizer(batch_prompts)
-        batch_time = (time.time() - start_time) * 1000
+        batch_time = (time.perf_counter() - start_time) * 1000
         batch_times.append(batch_time)
     return {
benchmark/generative_agents/bench_other.py
@@ -39,7 +39,7 @@ def main(args):
         answer = await call_generate(**arg, temperature=0)
         states.append(answer)
-    tic = time.time()
+    tic = time.perf_counter()
     # we always sequentially execute agent calls to maintain its dependency
     if args.backend != "lmql":
         for arg in tqdm(arguments):
@@ -50,7 +50,7 @@ def main(args):
         loop = asyncio.get_event_loop()
         for arg in tqdm(arguments):
            loop.run_until_complete(get_one_answer_async(arg))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     print(f"Latency: {latency:.3f}")
benchmark/generative_agents/bench_sglang.py
@@ -35,14 +35,14 @@ def main(args):
     states = []
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     for a in arguments:
         # only a single key in the dict
         for func, arg in a.items():
             result = func.run(**arg)
             result.sync()
         states.append(result)
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     # Compute accuracy
     print(f"Latency: {latency:.3f}")
benchmark/gsm8k/bench_other.py
@@ -75,7 +75,7 @@ def main(args):
             )
             states[i] = answer
-        tic = time.time()
+        tic = time.perf_counter()
         if args.parallel == 1:
             for i in tqdm(range(len(questions))):
                 get_one_answer(i)
@@ -106,9 +106,9 @@ def main(args):
                 for j in range(len(rets)):
                     states[i + j] = rets[j]
-        tic = time.time()
+        tic = time.perf_counter()
         asyncio.run(batched_call(batch_size=args.parallel))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     preds = []
     for i in range(len(states)):
benchmark/gsm8k/bench_sglang.py
@@ -84,14 +84,14 @@ def main(args):
     #####################################
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = few_shot_gsm8k.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     preds = []
     for i in range(len(states)):
benchmark/hellaswag/bench_other.py
@@ -57,7 +57,7 @@ def main(args):
                 context=few_shot_examples + questions[i], choices=choices[i]
             )
-        tic = time.time()
+        tic = time.perf_counter()
         if args.parallel == 1:
             for i in tqdm(range(len(questions))):
                 get_one_answer(i)
@@ -82,10 +82,10 @@ def main(args):
                 for j in range(len(rets)):
                     preds[i + j] = rets[j]
-        tic = time.time()
+        tic = time.perf_counter()
         asyncio.run(batched_call(batch_size=args.parallel))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     # Compute accuracy
     acc = np.mean(np.array(preds) == np.array(labels))
benchmark/hellaswag/bench_sglang.py
@@ -68,7 +68,7 @@ def main(args):
     #####################################
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,
@@ -76,7 +76,7 @@ def main(args):
         progress_bar=True,
     )
     preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     # Compute accuracy
     acc = np.mean(np.array(preds) == np.array(labels))
benchmark/hicache/bench_multiturn.py
@@ -261,7 +261,7 @@ class WorkloadGenerator:
                 client_id, payload = item
                 response = await async_request_sglang_generate(payload, self.url, self.pbar)
                 if self.pbar.n == self.pbar.total:
-                    self.finished_time = time.time()
+                    self.finished_time = time.perf_counter()
                 self.response_queue.put((client_id, response))
             except Exception as e:
                 print(f"Request failed: {e}")
@@ -334,7 +334,7 @@ class WorkloadGenerator:
         request_thread = threading.Thread(target=self.request_sender, daemon=True)
         response_thread = threading.Thread(target=self.response_handler, daemon=True)
-        self.start_time = time.time()
+        self.start_time = time.perf_counter()
         request_thread.start()
         response_thread.start()
benchmark/json_decode_regex/bench_other.py
@@ -53,7 +53,7 @@ def main(args):
     def get_one_answer(i):
         states[i] = json_decode(generate=call_generate, **arguments[i])
-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(arguments))):
             get_one_answer(i)
@@ -68,7 +68,7 @@ def main(args):
            for _ in rets:
                pass
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     # Compute accuracy
     print(f"Latency: {latency:.3f}")
benchmark/json_decode_regex/bench_sglang.py
@@ -63,11 +63,11 @@ def main(args):
     json_warm_up.run().sync()
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = json_decode.run_batch(
         arguments, temperature=0, num_threads=args.parallel, progress_bar=True
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     # Compute accuracy
     print(f"Latency: {latency:.3f}")
benchmark/json_jump_forward/bench_other.py
@@ -175,7 +175,7 @@ def bench_character(args):
     else:
         raise ValueError(f"Invalid backend: {args.backend}")
-    tic = time.time()
+    tic = time.perf_counter()
     if args.backend != "lmql":
         if args.parallel == 1:
@@ -202,7 +202,7 @@ def bench_character(args):
                asyncio.gather(*[get_one_answer_async(i) for i in bt])
            )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     return states, latency
@@ -236,7 +236,7 @@ def bench_city_doc(args):
     else:
         raise ValueError(f"Invalid backend: {args.backend}")
-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(arguments))):
             get_one_answer(i)
@@ -246,7 +246,7 @@ def bench_city_doc(args):
            for _ in rets:
                pass
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     return states, latency
benchmark/json_jump_forward/bench_sglang.py
@@ -67,14 +67,14 @@ def bench_city_doc(args):
     sgl.set_default_backend(backend)
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = city_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     return states, latency
@@ -91,14 +91,14 @@ def bench_character(args):
     sgl.set_default_backend(backend)
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = character_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     return states, latency
benchmark/json_schema/bench_sglang.py
@@ -85,14 +85,14 @@ def bench_schema(args):
     sgl.set_default_backend(backend)
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = schema_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     # Check if the outputs are valid
     indexes = []
benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
@@ -487,7 +487,7 @@ def main(args: argparse.Namespace):
         ]
         print(f"Start tuning over {len(search_space)} configurations...")
-        start = time.time()
+        start = time.perf_counter()
         configs = _distribute(
             "tune",
             [
@@ -522,7 +522,7 @@ def main(args: argparse.Namespace):
                 use_int8_w8a16,
                 block_shape,
             )
-        end = time.time()
+        end = time.perf_counter()
         print(f"Tuning took {end - start:.2f} seconds")
     else:
         outputs = _distribute(
benchmark/kernels/quantization/tuning_block_wise_kernel.py
@@ -359,7 +359,7 @@ def tune_on_gpu(args_dict):
         config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0
     ]
-    start = time.time()
+    start = time.perf_counter()
     results = {}
     for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
         N, K = shape[0], shape[1]
@@ -379,7 +379,7 @@ def tune_on_gpu(args_dict):
         best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)}
         save_configs(N, K, block_n, block_k, best_configs, save_path, input_type)
-    end = time.time()
+    end = time.perf_counter()
     print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
benchmark/line_retrieval/bench_sglang.py
@@ -70,7 +70,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
     # Select backend
     backend = select_sglang_backend(args)
-    tic = time.time()
+    tic = time.perf_counter()
     states = line_retrieval.run_batch(
         arguments,
         temperature=0,
@@ -78,7 +78,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     corrects = []
     for i in range(len(arguments)):
benchmark/llava_bench/bench_sglang.py
@@ -41,7 +41,7 @@ def main(args):
     sgl.set_default_backend(backend)
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm.tqdm(range(len(lines))):
             image_file = arguments[i]["image_file"]
@@ -52,7 +52,7 @@ def main(args):
         states = image_qa.run_batch(
             arguments, temperature=0, num_threads=args.parallel, progress_bar=True
         )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     print(f"Latency: {latency:.3f}")
benchmark/llm_judge/bench_other.py
@@ -85,7 +85,7 @@ def main(args):
     call_generate = partial(get_call_generate(args), temperature=0)
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     if args.backend != "lmql":
@@ -120,7 +120,7 @@ def main(args):
                asyncio.gather(*[get_one_answer_async(i) for i in bt])
            )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     # Compute accuracy
     print(f"Latency: {latency:.3f}")