xdb4_94051 / vllm · Commits

Commit acbed3ef (unverified)
Use monotonic time where appropriate (#1249)
Authored Oct 02, 2023 by Antoni Baum · committed via GitHub on Oct 02, 2023
Parent: 66d18a7f
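The motivation for the change: `time.time()` reads the wall clock, which NTP or an administrator can move forwards or backwards, so the difference between two readings is not a reliable duration. `time.monotonic()` and `time.perf_counter()` are guaranteed never to go backwards, at the cost of an arbitrary, meaningless origin, which makes them the right clocks for measuring intervals. A minimal illustration, independent of vLLM:

```python
import time

# time.time() is wall-clock: it can jump when the system clock is adjusted,
# so (t1 - t0) may be wrong, or even negative, across an adjustment.
# time.monotonic() and time.perf_counter() never go backwards; their
# absolute values are meaningless, but their differences are reliable.

start = time.perf_counter()
time.sleep(0.1)
elapsed = time.perf_counter() - start
print(f"elapsed: {elapsed:.4f} s")  # ~0.1 s even if the wall clock jumps
```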
Showing 7 changed files with 18 additions and 17 deletions (+18 -17).
benchmarks/benchmark_latency.py        +2 -2
benchmarks/benchmark_serving.py        +4 -4
benchmarks/benchmark_throughput.py     +4 -4
vllm/core/scheduler.py                 +1 -1
vllm/engine/async_llm_engine.py        +2 -1
vllm/engine/llm_engine.py              +3 -3
vllm/entrypoints/openai/api_server.py  +2 -2
benchmarks/benchmark_latency.py

@@ -40,13 +40,13 @@ def main(args: argparse.Namespace):
     def run_to_completion(profile: bool = False):
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.time()
+        start_time = time.perf_counter()
         llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                      sampling_params=sampling_params,
                      use_tqdm=False)
-        end_time = time.time()
+        end_time = time.perf_counter()
         latency = end_time - start_time
         if profile:
             torch.cuda.cudart().cudaProfilerStop()
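The pattern above, one `perf_counter()` reading on either side of `llm.generate(...)`, generalizes to any single-call latency measurement. A minimal sketch of that pattern (the `time_call` helper is hypothetical, not part of vLLM):

```python
import time
from statistics import mean

def time_call(fn, *args, **kwargs):
    """Return (result, latency in seconds) using a monotonic, high-resolution clock."""
    start = time.perf_counter()
    result = fn(*args, **kwargs)
    return result, time.perf_counter() - start

# Repeat and aggregate, as run_to_completion's callers do across iterations.
latencies = [time_call(time.sleep, 0.01)[1] for _ in range(10)]
print(f"avg latency: {mean(latencies):.4f} s")
```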
benchmarks/benchmark_serving.py

@@ -105,7 +105,7 @@ async def send_request(
     best_of: int,
     use_beam_search: bool,
 ) -> None:
-    request_start_time = time.time()
+    request_start_time = time.perf_counter()
     headers = {"User-Agent": "Benchmark Client"}
     if backend == "vllm":

@@ -148,7 +148,7 @@ async def send_request(
         if "error" not in output:
             break

-    request_end_time = time.time()
+    request_end_time = time.perf_counter()
     request_latency = request_end_time - request_start_time
     REQUEST_LATENCY.append((prompt_len, output_len, request_latency))

@@ -180,10 +180,10 @@ def main(args: argparse.Namespace):
     tokenizer = get_tokenizer(args.tokenizer,
                               trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()
     asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
                           args.use_beam_search, args.request_rate))
-    benchmark_end_time = time.time()
+    benchmark_end_time = time.perf_counter()
     benchmark_time = benchmark_end_time - benchmark_start_time
     print(f"Total time: {benchmark_time:.2f} s")
     print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
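In `benchmark_serving.py` the same swap protects per-request latencies captured concurrently: any wall-clock step during the run would previously have skewed whichever requests were in flight at that moment. A self-contained sketch of the measurement shape (the `timed_request` coroutine and its sleep stand in for the real HTTP round trip):

```python
import asyncio
import time

REQUEST_LATENCY: list[float] = []  # the benchmark stores (prompt_len, output_len, latency)

async def timed_request(delay: float) -> None:
    start = time.perf_counter()
    await asyncio.sleep(delay)  # hypothetical stand-in for the HTTP call
    REQUEST_LATENCY.append(time.perf_counter() - start)

async def main() -> None:
    # Many requests in flight at once, each timed independently.
    await asyncio.gather(*(timed_request(0.05) for _ in range(8)))

asyncio.run(main())
print(f"mean latency: {sum(REQUEST_LATENCY) / len(REQUEST_LATENCY):.4f} s")
```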
benchmarks/benchmark_throughput.py

@@ -93,10 +93,10 @@ def run_vllm(
             sampling_params=sampling_params,
         )

-    start = time.time()
+    start = time.perf_counter()
     # FIXME(woosuk): Do not use internal method.
     llm._run_engine(use_tqdm=True)
-    end = time.time()
+    end = time.perf_counter()
     return end - start

@@ -118,7 +118,7 @@ def run_hf(
     llm = llm.cuda()

     pbar = tqdm(total=len(requests))
-    start = time.time()
+    start = time.perf_counter()
     batch: List[str] = []
     max_prompt_len = 0
     max_output_len = 0

@@ -156,7 +156,7 @@ def run_hf(
             batch = []
             max_prompt_len = 0
             max_output_len = 0
-    end = time.time()
+    end = time.perf_counter()
     return end - start
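Here the elapsed time feeds a throughput figure (requests/s, and tokens/s elsewhere in the file), so a backwards wall-clock step could previously have produced an inflated, or even negative, denominator. The shape of the computation, with hypothetical stand-in work:

```python
import time

def run_benchmark(num_requests: int) -> float:
    # Mirrors run_vllm/run_hf: one perf_counter reading before the batch,
    # one after; the difference is immune to wall-clock adjustments.
    start = time.perf_counter()
    for _ in range(num_requests):
        time.sleep(0.001)  # hypothetical per-request work
    return time.perf_counter() - start

n = 100
elapsed = run_benchmark(n)
print(f"Throughput: {n / elapsed:.2f} requests/s")
```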
vllm/core/scheduler.py

@@ -121,7 +121,7 @@ class Scheduler:
         blocks_to_copy: Dict[int, List[int]] = {}

         # Fix the current time.
-        now = time.time()
+        now = time.monotonic()

         # Join waiting sequences if possible.
         if not self.swapped:
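The scheduler compares `now` against per-request arrival times to decide what to run next, so both readings must come from the same non-decreasing clock; mixing `time.time()` arrivals with a `time.monotonic()` `now` (or vice versa) would compare values from unrelated origins. A toy sketch of the invariant (names are illustrative, not vLLM's):

```python
import time
from dataclasses import dataclass, field

@dataclass
class PendingRequest:
    request_id: str
    # Recorded with the same clock the scheduler reads later, so
    # now - arrival_time is always a valid, non-negative waiting time.
    arrival_time: float = field(default_factory=time.monotonic)

def longest_waiting_first(waiting: list[PendingRequest]) -> list[PendingRequest]:
    now = time.monotonic()
    return sorted(waiting, key=lambda r: now - r.arrival_time, reverse=True)
```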
vllm/engine/async_llm_engine.py

@@ -417,7 +417,8 @@ class AsyncLLMEngine:
             request.
         """
         # Preprocess the request.
-        arrival_time = time.time()
+        # This should not be used for logging, as it is monotonic time.
+        arrival_time = time.monotonic()

         try:
             stream = await self.add_request(request_id,
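The comment added here is the key caveat: a monotonic reading counts from an arbitrary origin, so it cannot be rendered as a date or correlated across machines. When both a duration and a log-friendly timestamp are needed, one option is to record both clocks at arrival; a hypothetical illustration, not vLLM code:

```python
import time

class RequestRecord:
    """Illustrative only: keep one clock per purpose."""

    def __init__(self) -> None:
        self.arrival_monotonic = time.monotonic()  # for durations; never goes backwards
        self.arrival_wall = time.time()            # for logs only; human-readable epoch time

    def waiting_time(self) -> float:
        # Always computed monotonic-to-monotonic.
        return time.monotonic() - self.arrival_monotonic
```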
vllm/engine/llm_engine.py

@@ -256,10 +256,10 @@ class LLMEngine:
             prompt_token_ids: The token IDs of the prompt. If None, we
                 use the tokenizer to convert the prompts to token IDs.
             arrival_time: The arrival time of the request. If None, we use
-                the current time.
+                the current monotonic time.
         """
         if arrival_time is None:
-            arrival_time = time.time()
+            arrival_time = time.monotonic()
         if prompt_token_ids is None:
             assert prompt is not None
             prompt_token_ids = self.tokenizer.encode(prompt)

@@ -568,7 +568,7 @@ class LLMEngine:
         prompt_run: bool,
         num_batched_tokens: int,
     ) -> None:
-        now = time.time()
+        now = time.monotonic()
         # Log the number of batched input tokens.
         if prompt_run:
             self.num_prompt_tokens.append((now, num_batched_tokens))
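`_log_system_stats` appends `(now, num_batched_tokens)` pairs and later derives windowed throughput from them, so a consistent monotonic timebase keeps the window arithmetic valid. A sketch of that sliding-window pattern (the window length and helper names are hypothetical):

```python
import time
from collections import deque

WINDOW_S = 10.0  # hypothetical stats window
num_prompt_tokens: deque[tuple[float, int]] = deque()

def record_batch(num_batched_tokens: int) -> None:
    now = time.monotonic()
    num_prompt_tokens.append((now, num_batched_tokens))
    # Evict entries that fell out of the window; valid because the
    # timestamps are non-decreasing.
    while num_prompt_tokens and now - num_prompt_tokens[0][0] > WINDOW_S:
        num_prompt_tokens.popleft()

def prompt_tokens_per_second() -> float:
    if len(num_prompt_tokens) < 2:
        return 0.0
    span = num_prompt_tokens[-1][0] - num_prompt_tokens[0][0]
    return sum(n for _, n in num_prompt_tokens) / span if span > 0 else 0.0
```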
vllm/entrypoints/openai/api_server.py

@@ -210,7 +210,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
     model_name = request.model
     request_id = f"cmpl-{random_uuid()}"
-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,

@@ -411,7 +411,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     if error_check_ret is not None:
         return error_check_ret

-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,
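One caveat on this last file: unlike the benchmark and scheduler changes, the `created` field of an OpenAI-style response is conventionally a Unix timestamp (seconds since the epoch), while `time.monotonic()` counts from an arbitrary origin, often system boot. The two values are not interchangeable:

```python
import time

print(int(time.time()))       # e.g. 1696204800 -- seconds since the Unix epoch
print(int(time.monotonic()))  # e.g. 8423 -- seconds since an arbitrary origin,
                              # not convertible to a calendar date
```

Clients that parse `created` as epoch seconds would therefore see implausibly small values from the monotonic variant.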