norm / vllm

Commit acbed3ef (Unverified)
Authored Oct 02, 2023 by Antoni Baum; committed by GitHub on Oct 02, 2023

Use monotonic time where appropriate (#1249)
parent 66d18a7f
Showing 7 changed files with 18 additions and 17 deletions (+18, -17)
benchmarks/benchmark_latency.py         +2  -2
benchmarks/benchmark_serving.py         +4  -4
benchmarks/benchmark_throughput.py      +4  -4
vllm/core/scheduler.py                  +1  -1
vllm/engine/async_llm_engine.py         +2  -1
vllm/engine/llm_engine.py               +3  -3
vllm/entrypoints/openai/api_server.py   +2  -2
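The benchmark scripts in this commit switch their interval measurements from time.time() to time.perf_counter(), and the scheduler and engine code switch to time.monotonic(). As a hedged illustration (not part of the commit; the helper name and workload below are placeholders), this sketch shows why a wall-clock reading is a poor choice for measuring elapsed time: the system clock can be adjusted mid-measurement, while perf_counter() and monotonic() are guaranteed never to go backwards.

import time

def run_to_completion_sketch(workload) -> float:
    # Illustrative only: time.time() follows the system clock, which NTP or a
    # manual change can shift mid-run, so (end - start) may be skewed or even
    # negative. perf_counter() is monotonic and has the highest available
    # resolution, which is why the benchmarks below use it for latencies.
    start = time.perf_counter()
    workload()
    return time.perf_counter() - start

if __name__ == "__main__":
    print(f"elapsed: {run_to_completion_sketch(lambda: time.sleep(0.1)):.3f} s")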
benchmarks/benchmark_latency.py

@@ -40,13 +40,13 @@ def main(args: argparse.Namespace):
     def run_to_completion(profile: bool = False):
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.time()
+        start_time = time.perf_counter()
         llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                      sampling_params=sampling_params,
                      use_tqdm=False)
-        end_time = time.time()
+        end_time = time.perf_counter()
         latency = end_time - start_time
         if profile:
             torch.cuda.cudart().cudaProfilerStop()
benchmarks/benchmark_serving.py

@@ -105,7 +105,7 @@ async def send_request(
     best_of: int,
     use_beam_search: bool,
 ) -> None:
-    request_start_time = time.time()
+    request_start_time = time.perf_counter()
     headers = {"User-Agent": "Benchmark Client"}
     if backend == "vllm":
@@ -148,7 +148,7 @@ async def send_request(
         if "error" not in output:
             break
-    request_end_time = time.time()
+    request_end_time = time.perf_counter()
     request_latency = request_end_time - request_start_time
     REQUEST_LATENCY.append((prompt_len, output_len, request_latency))
@@ -180,10 +180,10 @@ def main(args: argparse.Namespace):
     tokenizer = get_tokenizer(args.tokenizer,
                               trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()
     asyncio.run(
         benchmark(args.backend, api_url, input_requests, args.best_of,
                   args.use_beam_search, args.request_rate))
-    benchmark_end_time = time.time()
+    benchmark_end_time = time.perf_counter()
     benchmark_time = benchmark_end_time - benchmark_start_time
     print(f"Total time: {benchmark_time:.2f} s")
     print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
benchmarks/benchmark_throughput.py

@@ -93,10 +93,10 @@ def run_vllm(
             sampling_params=sampling_params,
         )
-    start = time.time()
+    start = time.perf_counter()
     # FIXME(woosuk): Do not use internal method.
     llm._run_engine(use_tqdm=True)
-    end = time.time()
+    end = time.perf_counter()
     return end - start
@@ -118,7 +118,7 @@ def run_hf(
     llm = llm.cuda()
     pbar = tqdm(total=len(requests))
-    start = time.time()
+    start = time.perf_counter()
     batch: List[str] = []
     max_prompt_len = 0
     max_output_len = 0
@@ -156,7 +156,7 @@ def run_hf(
         batch = []
         max_prompt_len = 0
         max_output_len = 0
-    end = time.time()
+    end = time.perf_counter()
     return end - start
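An optional aside, not from the repository: the start/end bookkeeping repeated in run_vllm, run_hf, and the other benchmark functions can be wrapped in a small perf_counter()-based context manager. The elapsed_timer helper and its usage below are purely illustrative.

import time
from contextlib import contextmanager

@contextmanager
def elapsed_timer():
    # Hypothetical helper, not vLLM code: records a perf_counter() interval so
    # callers do not repeat the start/end arithmetic by hand.
    result = {}
    start = time.perf_counter()
    try:
        yield result
    finally:
        result["seconds"] = time.perf_counter() - start

# Example usage with a stand-in workload.
with elapsed_timer() as t:
    sum(range(1_000_000))
print(f"elapsed: {t['seconds']:.4f} s")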
vllm/core/scheduler.py

@@ -121,7 +121,7 @@ class Scheduler:
         blocks_to_copy: Dict[int, List[int]] = {}
         # Fix the current time.
-        now = time.time()
+        now = time.monotonic()
         # Join waiting sequences if possible.
         if not self.swapped:
vllm/engine/async_llm_engine.py

@@ -417,7 +417,8 @@ class AsyncLLMEngine:
             request.
         """
         # Preprocess the request.
-        arrival_time = time.time()
+        # This should not be used for logging, as it is monotonic time.
+        arrival_time = time.monotonic()
         try:
             stream = await self.add_request(
                 request_id,
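The comment added above notes that the monotonic arrival_time must not be used for logging, because time.monotonic() counts from an arbitrary reference point rather than the Unix epoch. A hedged sketch of that separation follows; the RequestTimes class is illustrative and not part of vLLM.

import time
from dataclasses import dataclass, field

@dataclass
class RequestTimes:
    # Illustrative only: keep a monotonic stamp for latency arithmetic and a
    # separate wall-clock stamp for anything that is logged or shown to users.
    arrival_monotonic: float = field(default_factory=time.monotonic)
    arrival_wall_clock: float = field(default_factory=time.time)

    def latency_so_far(self) -> float:
        # Always valid, even if the system clock is adjusted in between.
        return time.monotonic() - self.arrival_monotonic

req = RequestTimes()
time.sleep(0.05)
print(f"arrived at unix time {req.arrival_wall_clock:.0f}, "
      f"latency so far {req.latency_so_far() * 1000:.1f} ms")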
vllm/engine/llm_engine.py

@@ -256,10 +256,10 @@
             prompt_token_ids: The token IDs of the prompt. If None, we
                 use the tokenizer to convert the prompts to token IDs.
             arrival_time: The arrival time of the request. If None, we use
-                the current time.
+                the current monotonic time.
         """
         if arrival_time is None:
-            arrival_time = time.time()
+            arrival_time = time.monotonic()
         if prompt_token_ids is None:
             assert prompt is not None
             prompt_token_ids = self.tokenizer.encode(prompt)
@@ -568,7 +568,7 @@
         prompt_run: bool,
         num_batched_tokens: int,
     ) -> None:
-        now = time.time()
+        now = time.monotonic()
         # Log the number of batched input tokens.
         if prompt_run:
             self.num_prompt_tokens.append((now, num_batched_tokens))
vllm/entrypoints/openai/api_server.py

@@ -210,7 +210,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
     model_name = request.model
     request_id = f"cmpl-{random_uuid()}"
-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,
@@ -411,7 +411,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     if error_check_ret is not None:
         return error_check_ret
-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,