sglang commit 6e2da515 (unverified)
Authored May 11, 2025 by Lifu Huang; committed by GitHub on May 11, 2025

Replace time.time() to time.perf_counter() for benchmarking. (#6178)

Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>

Parent: e9a47f4c
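The commit changes only how elapsed time is measured: time.time() reads the wall clock, which can be shifted by NTP or manual clock adjustments, while time.perf_counter() is a monotonic, high-resolution clock meant for interval measurement. A minimal sketch of the before/after pattern (illustrative only; run_workload() is a stand-in, not code from this repository):

import time

def run_workload():
    # Stand-in for a benchmarked call such as a run_batch() invocation.
    sum(range(1_000_000))

# Before: wall-clock timing; deltas can be skewed if the system clock is adjusted.
tic = time.time()
run_workload()
wall_clock_latency = time.time() - tic

# After: monotonic, high-resolution timing, as adopted throughout this commit.
tic = time.perf_counter()
run_workload()
perf_latency = time.perf_counter() - tic

print(f"Latency: {perf_latency:.3f}")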
Changes: 61 files in total. This page shows 20 changed files, with 54 additions and 54 deletions.
benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py        +6 -6
benchmark/benchmark_batch/benchmark_batch.py                    +4 -4
benchmark/benchmark_batch/benchmark_tokenizer.py                +4 -4
benchmark/generative_agents/bench_other.py                      +2 -2
benchmark/generative_agents/bench_sglang.py                     +2 -2
benchmark/gsm8k/bench_other.py                                  +3 -3
benchmark/gsm8k/bench_sglang.py                                 +2 -2
benchmark/hellaswag/bench_other.py                              +3 -3
benchmark/hellaswag/bench_sglang.py                             +2 -2
benchmark/hicache/bench_multiturn.py                            +2 -2
benchmark/json_decode_regex/bench_other.py                      +2 -2
benchmark/json_decode_regex/bench_sglang.py                     +2 -2
benchmark/json_jump_forward/bench_other.py                      +4 -4
benchmark/json_jump_forward/bench_sglang.py                     +4 -4
benchmark/json_schema/bench_sglang.py                           +2 -2
benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py   +2 -2
benchmark/kernels/quantization/tuning_block_wise_kernel.py      +2 -2
benchmark/line_retrieval/bench_sglang.py                        +2 -2
benchmark/llava_bench/bench_sglang.py                           +2 -2
benchmark/llm_judge/bench_other.py                              +2 -2
benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py

@@ -64,11 +64,11 @@ def test_batch_by_batch(all_prompts, gen_len):
     tot_time = 0
     for i in range(len(all_prompts)):
-        tic = time.time()
+        tic = time.perf_counter()
         text_qa.run_batch(
             list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))),
         )
-        tot_time += time.time() - tic
+        tot_time += time.perf_counter() - tic

     return tot_time

@@ -78,13 +78,13 @@ def test_batch_by_batch_with_hint(all_prompts, gen_len):
     tot_time = 0
     for i in range(len(all_prompts)):
-        tic = time.time()
+        tic = time.perf_counter()
         # Send a hint to cache the prefix
         text_qa.run_batch(list(zip(all_prompts[i][:1], [gen_len])))
         # Send the batch
         text_qa.run_batch(list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))))

-        tot_time += time.time() - tic
+        tot_time += time.perf_counter() - tic

     return tot_time

@@ -94,11 +94,11 @@ def test_send_all(all_prompts, gen_len):
     all_prompts = [x for prompt_list in all_prompts for x in prompt_list]

-    tic = time.time()
+    tic = time.perf_counter()
     text_qa.run_batch(
         list(zip(all_prompts, [gen_len] * len(all_prompts))),
     )
-    tot_time = time.time() - tic
+    tot_time = time.perf_counter() - tic

     return tot_time
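The same tic/latency pattern repeats in the files below. As a side note, a small context manager could capture it in one place; the helper here is hypothetical and not part of this commit:

import time
from contextlib import contextmanager

@contextmanager
def stopwatch(results: list):
    # Append the elapsed time (seconds, monotonic) to `results` on exit.
    tic = time.perf_counter()
    try:
        yield
    finally:
        results.append(time.perf_counter() - tic)

timings = []
with stopwatch(timings):
    sum(range(1_000_000))  # stand-in for a benchmarked batch
print(f"Latency: {timings[-1]:.3f}")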
benchmark/benchmark_batch/benchmark_batch.py

@@ -81,7 +81,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
     }
     data = {"text": prompts, "sampling_params": sampling_params}

-    start_time = time.time()
+    start_time = time.perf_counter()
     try:
         response = requests.post(
             endpoint.base_url + "/generate", json=data, timeout=3600

@@ -90,7 +90,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
             error = response.json()
             raise RuntimeError(f"Request {request_id} failed: {error}")

         result = response.json()
-        elapsed_time = (time.time() - start_time) * 1000  # Convert to ms
+        elapsed_time = (time.perf_counter() - start_time) * 1000  # Convert to ms
         avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
         return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
     except Exception as e:

@@ -104,7 +104,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
     num_requests = len(batched_prompts)

     # Record start time for total latency
-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()

     for i, batch_prompts in enumerate(batched_prompts):
         request_id = i + 1

@@ -119,7 +119,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
         results.append(result)

     # Calculate total latency
-    total_latency = (time.time() - benchmark_start_time) * 1000  # Convert to ms
+    total_latency = (time.perf_counter() - benchmark_start_time) * 1000  # Convert to ms

     return results, total_latency
benchmark/benchmark_batch/benchmark_tokenizer.py

@@ -44,10 +44,10 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
     for run in range(NUM_RUNS):
         batch_prompts = prompts[:batch_size]  # Use same prompts for fair comparison
-        start_time = time.time()
+        start_time = time.perf_counter()
         for prompt in batch_prompts:
             tokens = tokenizer.encode(prompt)
-        sequential_time = (time.time() - start_time) * 1000
+        sequential_time = (time.perf_counter() - start_time) * 1000
         sequential_times.append(sequential_time)

     # Batch tokenization using tokenizer()

@@ -55,9 +55,9 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
     for run in range(NUM_RUNS):
         batch_prompts = prompts[:batch_size]  # Use same prompts for fair comparison
-        start_time = time.time()
+        start_time = time.perf_counter()
         tokens = tokenizer(batch_prompts)
-        batch_time = (time.time() - start_time) * 1000
+        batch_time = (time.perf_counter() - start_time) * 1000
         batch_times.append(batch_time)

     return {
benchmark/generative_agents/bench_other.py

@@ -39,7 +39,7 @@ def main(args):
             answer = await call_generate(**arg, temperature=0)
             states.append(answer)

-    tic = time.time()
+    tic = time.perf_counter()
     # we always sequentially execute agent calls to maintain its dependency
     if args.backend != "lmql":
         for arg in tqdm(arguments):

@@ -50,7 +50,7 @@ def main(args):
         loop = asyncio.get_event_loop()
         for arg in tqdm(arguments):
             loop.run_until_complete(get_one_answer_async(arg))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     print(f"Latency: {latency:.3f}")
benchmark/generative_agents/bench_sglang.py

@@ -35,14 +35,14 @@ def main(args):
     states = []

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     for a in arguments:
         # only a single key in the dict
         for func, arg in a.items():
             result = func.run(**arg)
             result.sync()
             states.append(result)
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     print(f"Latency: {latency:.3f}")
benchmark/gsm8k/bench_other.py

@@ -75,7 +75,7 @@ def main(args):
         )
         states[i] = answer

-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(questions))):
             get_one_answer(i)

@@ -106,9 +106,9 @@ def main(args):
                 for j in range(len(rets)):
                     states[i + j] = rets[j]

-        tic = time.time()
+        tic = time.perf_counter()
         asyncio.run(batched_call(batch_size=args.parallel))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     preds = []
     for i in range(len(states)):
benchmark/gsm8k/bench_sglang.py

@@ -84,14 +84,14 @@ def main(args):
     #####################################

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = few_shot_gsm8k.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     preds = []
     for i in range(len(states)):
benchmark/hellaswag/bench_other.py

@@ -57,7 +57,7 @@ def main(args):
             context=few_shot_examples + questions[i], choices=choices[i]
         )

-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(questions))):
             get_one_answer(i)

@@ -82,10 +82,10 @@ def main(args):
                 for j in range(len(rets)):
                     preds[i + j] = rets[j]

-        tic = time.time()
+        tic = time.perf_counter()
         asyncio.run(batched_call(batch_size=args.parallel))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     acc = np.mean(np.array(preds) == np.array(labels))
benchmark/hellaswag/bench_sglang.py

@@ -68,7 +68,7 @@ def main(args):
     #####################################

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,

@@ -76,7 +76,7 @@ def main(args):
         progress_bar=True,
     )
     preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     acc = np.mean(np.array(preds) == np.array(labels))
benchmark/hicache/bench_multiturn.py

@@ -261,7 +261,7 @@ class WorkloadGenerator:
                 client_id, payload = item
                 response = await async_request_sglang_generate(payload, self.url, self.pbar)
                 if self.pbar.n == self.pbar.total:
-                    self.finished_time = time.time()
+                    self.finished_time = time.perf_counter()
                 self.response_queue.put((client_id, response))
             except Exception as e:
                 print(f"Request failed: {e}")

@@ -334,7 +334,7 @@ class WorkloadGenerator:
         request_thread = threading.Thread(target=self.request_sender, daemon=True)
         response_thread = threading.Thread(target=self.response_handler, daemon=True)

-        self.start_time = time.time()
+        self.start_time = time.perf_counter()
         request_thread.start()
         response_thread.start()
benchmark/json_decode_regex/bench_other.py

@@ -53,7 +53,7 @@ def main(args):
     def get_one_answer(i):
         states[i] = json_decode(generate=call_generate, **arguments[i])

-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(arguments))):
             get_one_answer(i)

@@ -68,7 +68,7 @@ def main(args):
             for _ in rets:
                 pass

-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     print(f"Latency: {latency:.3f}")
benchmark/json_decode_regex/bench_sglang.py

@@ -63,11 +63,11 @@ def main(args):
     json_warm_up.run().sync()

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = json_decode.run_batch(
         arguments, temperature=0, num_threads=args.parallel, progress_bar=True
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     print(f"Latency: {latency:.3f}")
benchmark/json_jump_forward/bench_other.py

@@ -175,7 +175,7 @@ def bench_character(args):
     else:
         raise ValueError(f"Invalid backend: {args.backend}")

-    tic = time.time()
+    tic = time.perf_counter()
     if args.backend != "lmql":
         if args.parallel == 1:

@@ -202,7 +202,7 @@ def bench_character(args):
             asyncio.gather(*[get_one_answer_async(i) for i in bt])
         )

-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     return states, latency

@@ -236,7 +236,7 @@ def bench_city_doc(args):
     else:
         raise ValueError(f"Invalid backend: {args.backend}")

-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(arguments))):
             get_one_answer(i)

@@ -246,7 +246,7 @@ def bench_city_doc(args):
             for _ in rets:
                 pass

-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     return states, latency
benchmark/json_jump_forward/bench_sglang.py

@@ -67,14 +67,14 @@ def bench_city_doc(args):
     sgl.set_default_backend(backend)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = city_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     return states, latency

@@ -91,14 +91,14 @@ def bench_character(args):
     sgl.set_default_backend(backend)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = character_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     return states, latency
benchmark/json_schema/bench_sglang.py

@@ -85,14 +85,14 @@ def bench_schema(args):
     sgl.set_default_backend(backend)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = schema_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Check if the outputs are valid
     indexes = []
benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py

@@ -487,7 +487,7 @@ def main(args: argparse.Namespace):
         ]

         print(f"Start tuning over {len(search_space)} configurations...")

-        start = time.time()
+        start = time.perf_counter()
         configs = _distribute(
             "tune",
             [

@@ -522,7 +522,7 @@ def main(args: argparse.Namespace):
                 use_int8_w8a16,
                 block_shape,
             )
-        end = time.time()
+        end = time.perf_counter()
         print(f"Tuning took {end - start:.2f} seconds")
     else:
         outputs = _distribute(
benchmark/kernels/quantization/tuning_block_wise_kernel.py

@@ -359,7 +359,7 @@ def tune_on_gpu(args_dict):
         config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0
     ]

-    start = time.time()
+    start = time.perf_counter()
     results = {}
     for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
         N, K = shape[0], shape[1]

@@ -379,7 +379,7 @@ def tune_on_gpu(args_dict):
         best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)}
         save_configs(N, K, block_n, block_k, best_configs, save_path, input_type)

-    end = time.time()
+    end = time.perf_counter()
     print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
benchmark/line_retrieval/bench_sglang.py

@@ -70,7 +70,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
     # Select backend
     backend = select_sglang_backend(args)

-    tic = time.time()
+    tic = time.perf_counter()
     states = line_retrieval.run_batch(
         arguments,
         temperature=0,

@@ -78,7 +78,7 @@ def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     corrects = []
     for i in range(len(arguments)):
benchmark/llava_bench/bench_sglang.py

@@ -41,7 +41,7 @@ def main(args):
     sgl.set_default_backend(backend)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm.tqdm(range(len(lines))):
             image_file = arguments[i]["image_file"]

@@ -52,7 +52,7 @@ def main(args):
         states = image_qa.run_batch(
             arguments, temperature=0, num_threads=args.parallel, progress_bar=True
         )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     print(f"Latency: {latency:.3f}")
benchmark/llm_judge/bench_other.py

@@ -85,7 +85,7 @@ def main(args):
     call_generate = partial(get_call_generate(args), temperature=0)

     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     if args.backend != "lmql":

@@ -120,7 +120,7 @@ def main(args):
             asyncio.gather(*[get_one_answer_async(i) for i in bt])
         )

-    latency = time.time() - tic
+    latency = time.perf_counter() - tic

     # Compute accuracy
     print(f"Latency: {latency:.3f}")