change / sglang / Commits / 6e2da515

Commit 6e2da515 (Unverified)
Authored May 11, 2025 by Lifu Huang; committed by GitHub on May 11, 2025

Replace time.time() with time.perf_counter() for benchmarking. (#6178)

Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
Parent: e9a47f4c
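The rationale, as a minimal sketch (do_work is a hypothetical stand-in for the benchmarked request batch): time.time() reads the wall clock, which has limited resolution on some platforms and can jump when the system clock is adjusted mid-run, while time.perf_counter() is a monotonic, high-resolution clock intended specifically for measuring elapsed time.

import time


def do_work():
    # Hypothetical stand-in for the code being benchmarked.
    time.sleep(0.1)


tic = time.perf_counter()              # monotonic and high-resolution
do_work()
latency = time.perf_counter() - tic    # unaffected by NTP or manual clock changes
print(f"Latency: {latency:.3f}s")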
Changes: 61

Showing 20 changed files with 60 additions and 60 deletions (+60 -60)
benchmark/tip_suggestion/bench_sglang.py             +2 -2
benchmark/tree_of_thought_deep/bench_other.py        +2 -2
benchmark/tree_of_thought_deep/bench_sglang.py       +2 -2
benchmark/tree_of_thought_v0/bench_other.py          +2 -2
benchmark/tree_of_thought_v0/bench_sglang.py         +2 -2
python/sglang/test/few_shot_gsm8k.py                 +2 -2
python/sglang/test/few_shot_gsm8k_engine.py          +2 -2
python/sglang/test/run_eval.py                       +2 -2
python/sglang/test/test_programs.py                  +4 -4
python/sglang/test/test_utils.py                     +7 -7
sgl-router/py_test/test_launch_server.py             +4 -4
test/srt/experiment_runner.py                        +5 -5
test/srt/models/test_encoder_embedding_models.py     +4 -4
test/srt/test_gptqmodel_dynamic.py                   +4 -4
test/srt/test_release_memory_occupation.py           +4 -4
test/srt/test_torch_compile.py                       +2 -2
test/srt/test_torch_compile_moe.py                   +2 -2
test/srt/test_torchao.py                             +2 -2
test/srt/test_update_weights_from_distributed.py     +4 -4
test/srt/test_update_weights_from_tensor.py          +2 -2
benchmark/tip_suggestion/bench_sglang.py

@@ -65,11 +65,11 @@ def main(args):
     sgl.set_default_backend(select_sglang_backend(args))
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = suggest_tips.run_batch(
         arguments, temperature=0, num_threads=args.parallel, progress_bar=True
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     # Compute accuracy
     print(f"Latency: {latency:.3f}")
benchmark/tree_of_thought_deep/bench_other.py

@@ -138,7 +138,7 @@ def main(args):
     # Run requests
     states = [None] * len(questions)
-    tic = time.time()
+    tic = time.perf_counter()
     if args.backend != "lmql":

         def get_one_answer(i):

@@ -177,7 +177,7 @@ def main(args):
             tasks = [get_one_answer_async(k) for k in bt]
             loop.run_until_complete(asyncio.gather(*tasks))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     answers_text = []
     for s in states:
benchmark/tree_of_thought_deep/bench_sglang.py

@@ -119,7 +119,7 @@ def main(args):
     backend = select_sglang_backend(args)
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = tree_search.run_batch(
         arguments,
         temperature=0,

@@ -127,7 +127,7 @@ def main(args):
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     answers_text = []
     for s in states:
         answers_text.append([x for xs in s.ret_value for x in xs])
benchmark/tree_of_thought_v0/bench_other.py

@@ -121,7 +121,7 @@ def main(args):
     def get_one_answer(i):
         states[i] = tree_search(**arguments[i], call_generate=call_generate)
-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(questions))):
             get_one_answer(i)

@@ -134,7 +134,7 @@ def main(args):
                 )
             )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     answers_text = []
     for s in states:
benchmark/tree_of_thought_v0/bench_sglang.py

@@ -107,7 +107,7 @@ def main(args):
     backend = select_sglang_backend(args)
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = tree_search.run_batch(
         arguments,
         temperature=0,

@@ -115,7 +115,7 @@ def main(args):
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
     answers_text = []
     for s in states:
         answers_text.append([x for xs in s["answer"] for x in xs])
python/sglang/test/few_shot_gsm8k.py

@@ -90,7 +90,7 @@ def run_eval(args):
    #####################################
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    states = few_shot_gsm8k.run_batch(
        arguments,
        temperature=args.temperature if hasattr(args, "temperature") else 0,

@@ -99,7 +99,7 @@ def run_eval(args):
        return_logprob=getattr(args, "return_logprob", None),
        logprob_start_len=getattr(args, "logprob_start_len", None),
    )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    preds = []
    for i in range(len(states)):
python/sglang/test/few_shot_gsm8k_engine.py

@@ -89,7 +89,7 @@ def run_eval(args):
    }
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    loop = asyncio.get_event_loop()

@@ -98,7 +98,7 @@ def run_eval(args):
    )
    # End requests
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Shutdown the engine
    engine.shutdown()
python/sglang/test/run_eval.py

@@ -71,9 +71,9 @@ def run_eval(args):
    )
    # Run eval
-    tic = time.time()
+    tic = time.perf_counter()
    result = eval_obj(sampler)
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Dump reports
    metrics = result.metrics | {"score": result.score}
python/sglang/test/test_programs.py

@@ -503,7 +503,7 @@ def test_hellaswag_select():
    #####################################
    # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
    rets = few_shot_hellaswag.run_batch(
        arguments,
        temperature=0,

@@ -514,13 +514,13 @@ def test_hellaswag_select():
    preds = []
    for i, ret in enumerate(rets):
        preds.append(choices[i].index(ret["answer"]))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
    # Compute accuracy
    accuracy = np.mean(np.array(preds) == np.array(labels))
    # Test generator style of run_batch
-    tic = time.time()
+    tic = time.perf_counter()
    rets = few_shot_hellaswag.run_batch(
        arguments,
        temperature=0,

@@ -531,7 +531,7 @@ def test_hellaswag_select():
    preds_gen = []
    for i, ret in enumerate(rets):
        preds_gen.append(choices[i].index(ret["answer"]))
-    latency_gen = time.time() - tic
+    latency_gen = time.perf_counter() - tic
    # Compute accuracy
    accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
python/sglang/test/test_utils.py

@@ -449,9 +449,9 @@ def popen_launch_server(
    else:
        process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
-    start_time = time.time()
+    start_time = time.perf_counter()
    with requests.Session() as session:
-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
            try:
                headers = {
                    "Content-Type": "application/json; charset=utf-8",

@@ -584,7 +584,7 @@ class TestFile:
def run_unittest_files(files: List[TestFile], timeout_per_file: float):
-    tic = time.time()
+    tic = time.perf_counter()
    success = True
    for i, file in enumerate(files):

@@ -599,13 +599,13 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
            f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
            flush=True,
        )
-        tic = time.time()
+        tic = time.perf_counter()
        process = subprocess.Popen(
            ["python3", filename], stdout=None, stderr=None, env=os.environ
        )
        process.wait()
-        elapsed = time.time() - tic
+        elapsed = time.perf_counter() - tic
        print(
            f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",

@@ -631,9 +631,9 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
            break
    if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
    else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
+        print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
    return 0 if success else -1
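The popen_launch_server and run_unittest_files changes above apply the same substitution to timeout bookkeeping: a deadline computed from time.perf_counter() cannot be shortened or extended by a wall-clock adjustment while the server is starting. A minimal sketch of that readiness-polling pattern, assuming a /health endpoint like the one the tests poll (wait_for_health itself is a hypothetical helper, not part of this diff):

import time

import requests


def wait_for_health(base_url: str, timeout: float = 300.0) -> bool:
    # Poll the health endpoint until it returns 200 or the deadline passes.
    start_time = time.perf_counter()
    with requests.Session() as session:
        while time.perf_counter() - start_time < timeout:
            try:
                if session.get(f"{base_url}/health").status_code == 200:
                    return True
            except requests.RequestException:
                pass  # server not accepting connections yet
            time.sleep(1)
    return False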
sgl-router/py_test/test_launch_server.py

@@ -92,9 +92,9 @@ def popen_launch_router(
    process = subprocess.Popen(command, stdout=None, stderr=None)
-    start_time = time.time()
+    start_time = time.perf_counter()
    with requests.Session() as session:
-        while time.time() - start_time < timeout:
+        while time.perf_counter() - start_time < timeout:
            try:
                response = session.get(f"{base_url}/health")
                if response.status_code == 200:

@@ -155,11 +155,11 @@ def terminate_and_wait(process, timeout=300):
        return
    process.terminate()
-    start_time = time.time()
+    start_time = time.perf_counter()
    while process.poll() is None:
        print(f"Terminating process {process.pid}")
-        if time.time() - start_time > timeout:
+        if time.perf_counter() - start_time > timeout:
            raise TimeoutError(
                f"Process {process.pid} failed to terminate within {timeout}s"
            )
test/srt/experiment_runner.py

@@ -184,9 +184,9 @@ class ExperimentRunner:
        self.logger = logging.getLogger(__name__)
    def wait_for_server(self, port: int, timeout: int = 300) -> bool:
-        start_time = time.time()
-        while time.time() - start_time < timeout:
+        start_time = time.perf_counter()
+        while time.perf_counter() - start_time < timeout:
            try:
                response = requests.get(f"http://localhost:{port}/health")
                if response.status_code == 200:

@@ -197,7 +197,7 @@ class ExperimentRunner:
        return False
    def run_task(self, config: TaskConfig) -> TaskResult:
-        start_time = time.time()
+        start_time = time.perf_counter()
        client_output = []
        try:

@@ -247,7 +247,7 @@ class ExperimentRunner:
                name=config.name,
                success=True,
                output=formatted_output,
-                runtime=time.time() - start_time,
+                runtime=time.perf_counter() - start_time,
                timestamp=datetime.now().isoformat(),
            )

@@ -256,7 +256,7 @@ class ExperimentRunner:
                name=config.name,
                success=False,
                output=str(e),
-                runtime=time.time() - start_time,
+                runtime=time.perf_counter() - start_time,
                timestamp=datetime.now().isoformat(),
            )
test/srt/models/test_encoder_embedding_models.py

@@ -79,9 +79,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
            # warm up
            hf_outputs = hf_runner.forward(truncated_prompts)
-            st_start_time = time.time()
+            st_start_time = time.perf_counter()
            hf_outputs = hf_runner.forward(truncated_prompts)
-            st_end_time = time.time()
+            st_end_time = time.perf_counter()
        with SRTRunner(
            model_path,

@@ -95,9 +95,9 @@ class TestEncoderEmbeddingModels(CustomTestCase):
            # warm up
            srt_outputs = srt_runner.forward(truncated_prompts)
-            sgl_start_time = time.time()
+            sgl_start_time = time.perf_counter()
            srt_outputs = srt_runner.forward(truncated_prompts)
-            sgl_end_time = time.time()
+            sgl_end_time = time.perf_counter()
        transformer_time = st_end_time - st_start_time
        sgl_time = sgl_end_time - sgl_start_time
test/srt/test_gptqmodel_dynamic.py

@@ -130,9 +130,9 @@ class TestGPTQModelDynamic(CustomTestCase):
    def test_throughput(self):
        max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
        result = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
        print(f"result = `{result}`")

@@ -185,9 +185,9 @@ class TestGPTQModelDynamicWithMarlin(CustomTestCase):
    def test_throughput(self):
        max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
        result = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
        print(f"result = `{result}`")
test/srt/test_release_memory_occupation.py

@@ -42,10 +42,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
        )
        print("release_memory_occupation start")
-        t = time.time()
+        t = time.perf_counter()
        engine.release_memory_occupation()
        if _DEBUG_EXTRA:
-            print("release_memory_occupation", time.time() - t)
+            print("release_memory_occupation", time.perf_counter() - t)
        if _DEBUG_EXTRA:
            time.sleep(5)

@@ -60,10 +60,10 @@ class TestReleaseMemoryOccupation(CustomTestCase):
            time.sleep(5)
        print("resume_memory_occupation start")
-        t = time.time()
+        t = time.perf_counter()
        engine.resume_memory_occupation()
        if _DEBUG_EXTRA:
-            print("resume_memory_occupation", time.time() - t)
+            print("resume_memory_occupation", time.perf_counter() - t)
        self.assertEqual(
            _try_allocate_big_tensor(),
test/srt/test_torch_compile.py

@@ -62,9 +62,9 @@ class TestTorchCompile(CustomTestCase):
        res = self.run_decode(16)
        max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
        res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
        print(f"{res=}")
        throughput = max_tokens / (tok - tic)
        print(f"Throughput: {throughput} tokens/s")
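This and the following torch.compile MoE and torchao throughput tests share the same measurement shape: bracket one fixed-size decode with perf_counter() reads and divide the token count by the elapsed time. A minimal sketch under that assumption (run_decode here is a hypothetical placeholder for the test's decode request helper):

import time


def run_decode(max_tokens: int) -> str:
    # Hypothetical placeholder for the decode request issued by the test.
    time.sleep(0.05)
    return "ok"


max_tokens = 256
tic = time.perf_counter()
res = run_decode(max_tokens)
tok = time.perf_counter()
throughput = max_tokens / (tok - tic)   # tokens per second of elapsed time
print(f"Throughput: {throughput:.1f} tokens/s")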
test/srt/test_torch_compile_moe.py

@@ -62,9 +62,9 @@ class TestTorchCompileMoe(CustomTestCase):
        res = self.run_decode(16)
        max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
        res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
        print(f"{res=}")
        throughput = max_tokens / (tok - tic)
        self.assertGreaterEqual(throughput, 285)
test/srt/test_torchao.py

@@ -61,9 +61,9 @@ class TestTorchAO(CustomTestCase):
        max_tokens = 256
-        tic = time.time()
+        tic = time.perf_counter()
        res = self.run_decode(max_tokens)
-        tok = time.time()
+        tok = time.perf_counter()
        print(res["text"])
        throughput = max_tokens / (tok - tic)
        print(f"Throughput: {throughput} tokens/s")
test/srt/test_update_weights_from_distributed.py

@@ -164,7 +164,7 @@ def init_process_hf(
    )
    dist.barrier(group=group, device_ids=[rank])
    torch.cuda.synchronize()
-    time_begin_broadcast = time.time()
+    time_begin_broadcast = time.perf_counter()
    # The last parameter is lm_head.weight, which is tied
    # with embed_tokens.weight. Actually, we only need

@@ -182,7 +182,7 @@ def init_process_hf(
            group=group,
        )
    torch.cuda.synchronize()
-    time_end_broadcast = time.time()
+    time_end_broadcast = time.perf_counter()
    # Measure the latency of broadcasting/weights update.
    broadcast_time = time_end_broadcast - time_begin_broadcast

@@ -282,7 +282,7 @@ def init_process_sgl(
    )
    torch.cuda.synchronize()
-    time_begin_update = time.time()
+    time_begin_update = time.perf_counter()
    # The last parameter is lm_head.weight, which is tied
    # with embed_tokens.weight. Actually, we only need

@@ -312,7 +312,7 @@ def init_process_sgl(
        },
    )
    torch.cuda.synchronize()
-    time_end_update = time.time()
+    time_end_update = time.perf_counter()
    # Measure the latency of broadcast/weights update.
    update_time = time_end_update - time_begin_update
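Because CUDA work is launched asynchronously, the distributed weight-update test calls torch.cuda.synchronize() immediately before each timestamp; otherwise the host-side clock would stop before the device finished. A minimal sketch of that synchronize-then-read pattern (the matrix multiply is only a placeholder for the broadcast or weight update being timed):

import time

import torch


def time_gpu_section() -> float:
    x = torch.randn(4096, 4096, device="cuda")
    torch.cuda.synchronize()            # drain any pending kernels first
    t0 = time.perf_counter()
    _ = x @ x                           # placeholder for the measured GPU work
    torch.cuda.synchronize()            # wait for completion before reading the clock
    return time.perf_counter() - t0


if torch.cuda.is_available():
    print(f"Elapsed: {time_gpu_section():.4f}s")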
test/srt/test_update_weights_from_tensor.py

@@ -21,9 +21,9 @@ def test_update_weights_from_tensor(tp_size):
    memory_before = torch.cuda.memory_allocated()
    new_tensor = torch.full((16384, 2048), 1.5, device="cuda")
-    time_start = time.time()
+    time_start = time.perf_counter()
    engine.update_weights_from_tensor([(x, new_tensor) for x in param_names])
-    print(f"Time delta: {time.time() - time_start:.03f}")
+    print(f"Time delta: {time.perf_counter() - time_start:.03f}")
    for param_name in param_names[:3]:
        _check_param(engine, param_name, [1.5] * 5)