norm / vllm / Commits / 017d9f15

Unverified commit 017d9f15, authored Feb 20, 2024 by Antoni Baum, committed by GitHub on Feb 20, 2024.

Add metrics to RequestOutput (#2876)

Parent: 181b27d8
Showing 6 changed files with 61 additions and 9 deletions (+61, -9).
Files changed:

  tests/async_engine/test_request_tracker.py   +1  -1
  vllm/core/policy.py                          +1  -1
  vllm/core/scheduler.py                       +3  -0
  vllm/engine/llm_engine.py                    +5  -2
  vllm/outputs.py                              +9  -1
  vllm/sequence.py                             +42 -4
tests/async_engine/test_request_tracker.py

@@ -64,7 +64,7 @@ def test_request_tracker():
     stream_5 = tracker.add_request("5")
     assert tracker.new_requests_event.flag
     tracker.process_request_output(
-        RequestOutput("2", "output", [], [], [], finished=True))
+        RequestOutput("2", "output", [], [], [], bool(finished)))
     new, finished = tracker.get_new_and_finished_requests()
     assert not tracker.new_requests_event.flag
     assert len(finished) == 1
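The test still constructs a RequestOutput directly; that keeps working because the new metrics argument is keyword-optional. A minimal sketch, assuming vLLM at this commit is installed, of attaching a RequestMetrics object explicitly:

# Minimal sketch (assumes vLLM at this commit): the new `metrics` parameter
# defaults to None, so existing positional call sites stay valid, and a
# RequestMetrics object can be passed explicitly when desired.
import time

from vllm.outputs import RequestOutput
from vllm.sequence import RequestMetrics

now = time.time()
metrics = RequestMetrics(arrival_time=now,
                         last_token_time=now,
                         first_scheduled_time=None,
                         first_token_time=None,
                         time_in_queue=None)
out = RequestOutput("2", "output", [], [], [], finished=True, metrics=metrics)
print(out.finished, out.metrics)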
vllm/core/policy.py

@@ -33,7 +33,7 @@ class FCFS(Policy):
         now: float,
         seq_group: SequenceGroup,
     ) -> float:
-        return now - seq_group.arrival_time
+        return now - seq_group.metrics.arrival_time


 class PolicyFactory:
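The FCFS policy's behaviour is unchanged: a waiting request's priority is still its age, only now read from the metrics object. A standalone sketch with simplified stand-in classes (not vLLM's real types) of the same rule:

# Standalone sketch (stand-in classes, not vLLM's SequenceGroup): FCFS
# priority is the request's age, read from seq_group.metrics.arrival_time.
import time
from dataclasses import dataclass


@dataclass
class _Metrics:
    arrival_time: float


@dataclass
class _SeqGroup:
    request_id: str
    metrics: _Metrics


def fcfs_priority(now: float, seq_group: _SeqGroup) -> float:
    return now - seq_group.metrics.arrival_time


now = time.time()
groups = [_SeqGroup("b", _Metrics(arrival_time=now - 0.5)),
          _SeqGroup("a", _Metrics(arrival_time=now - 2.0))]
# Higher priority (older arrival) is scheduled first.
groups.sort(key=lambda g: fcfs_priority(now, g), reverse=True)
print([g.request_id for g in groups])  # -> ['a', 'b']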
vllm/core/scheduler.py

@@ -365,10 +365,13 @@ class Scheduler:
         # This function call changes the internal states of the scheduler
         # such as self.running, self.swapped, and self.waiting.
         scheduler_outputs = self._schedule()
+        now = time.time()

         # Create input data structures.
         seq_group_metadata_list: List[SequenceGroupMetadata] = []
         for seq_group in scheduler_outputs.scheduled_seq_groups:
+            seq_group.maybe_set_first_scheduled_time(now)
+
             seq_data: Dict[int, SequenceData] = {}
             block_tables: Dict[int, List[int]] = {}
             for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
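The scheduler now stamps each sequence group the first time it is scheduled. A standalone sketch (simplified stand-ins, not the real Scheduler) of how maybe_set_first_scheduled_time yields time_in_queue and why repeated calls are harmless:

# Standalone sketch (stand-in classes, not vLLM's Scheduler): the first call
# stamps first_scheduled_time and derives time_in_queue; later calls no-op.
import time
from dataclasses import dataclass
from typing import Optional


@dataclass
class _Metrics:
    arrival_time: float
    first_scheduled_time: Optional[float] = None
    time_in_queue: Optional[float] = None


class _SeqGroup:
    def __init__(self, arrival_time: float) -> None:
        self.metrics = _Metrics(arrival_time=arrival_time)

    def maybe_set_first_scheduled_time(self, t: float) -> None:
        if self.metrics.first_scheduled_time is None:
            self.metrics.first_scheduled_time = t
            self.metrics.time_in_queue = t - self.metrics.arrival_time


group = _SeqGroup(arrival_time=time.time())
time.sleep(0.05)                                    # request waits in queue
group.maybe_set_first_scheduled_time(time.time())   # first scheduling step
queued = group.metrics.time_in_queue
group.maybe_set_first_scheduled_time(time.time())   # later steps: unchanged
assert group.metrics.time_in_queue == queued
print(f"time_in_queue ~ {queued:.3f}s")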
vllm/engine/llm_engine.py

@@ -728,6 +728,7 @@ class LLMEngine:
     def _process_model_outputs(
             self, output: SamplerOutput,
             scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
+        now = time.time()
         # Update the scheduled sequence groups with the model outputs.
         scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
         for seq_group, outputs in zip(scheduled_seq_groups, output):

@@ -739,6 +740,7 @@ class LLMEngine:
         # Create the outputs.
         request_outputs: List[RequestOutput] = []
         for seq_group in scheduled_seq_groups:
+            seq_group.maybe_set_first_token_time(now)
             request_output = RequestOutput.from_seq_group(seq_group)
             request_outputs.append(request_output)
         for seq_group in scheduler_outputs.ignored_seq_groups:

@@ -876,11 +878,12 @@ class LLMEngine:
             # Latency Timings.
             time_last_iters = []
             for seq_group in scheduler_outputs.scheduled_seq_groups:
-                # Time since last token. (n.b. updates seq_group.last_token_time)
+                # Time since last token.
+                # (n.b. updates seq_group.metrics.last_token_time)
                 time_last_iters.append(seq_group.get_last_latency(now))
                 # Time since arrival for all finished requests.
                 if seq_group.is_finished():
-                    time_e2e_requests.append(now - seq_group.arrival_time)
+                    time_e2e_requests.append(now - seq_group.metrics.arrival_time)

             time_to_first_tokens = time_last_iters if prompt_run else []
             time_per_output_tokens = [] if prompt_run else time_last_iters
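With first_token_time stamped in _process_model_outputs and the end-to-end time measured against arrival_time, request-level latencies fall out as simple differences. A standalone sketch (field names follow the RequestMetrics dataclass added below in vllm/sequence.py):

# Standalone sketch: deriving per-request timings from stamped timestamps.
# Field names mirror the RequestMetrics dataclass added in vllm/sequence.py.
from dataclasses import dataclass
from typing import Optional


@dataclass
class _Metrics:
    arrival_time: float
    first_scheduled_time: Optional[float]
    first_token_time: Optional[float]
    finished_time: Optional[float]


def time_to_first_token(m: _Metrics) -> Optional[float]:
    # Arrival -> first generated token.
    if m.first_token_time is None:
        return None
    return m.first_token_time - m.arrival_time


def e2e_latency(m: _Metrics) -> Optional[float]:
    # Arrival -> request finished.
    if m.finished_time is None:
        return None
    return m.finished_time - m.arrival_time


m = _Metrics(arrival_time=100.0,
             first_scheduled_time=100.2,
             first_token_time=100.35,
             finished_time=101.5)
print(time_to_first_token(m), e2e_latency(m))  # ~0.35 and 1.5 seconds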
vllm/outputs.py

 from typing import List, Optional
+import time

 from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup,
-                           SequenceStatus)
+                           SequenceStatus, RequestMetrics)
 from vllm.lora.request import LoRARequest

@@ -60,6 +61,7 @@ class RequestOutput:
         prompt_logprobs: The log probabilities to return per prompt token.
         outputs: The output sequences of the request.
         finished: Whether the whole request is finished.
+        metrics: Metrics associated with the request.
         lora_request: The LoRA request that was used to generate the output.
     """

@@ -71,6 +73,7 @@ class RequestOutput:
         prompt_logprobs: Optional[PromptLogprobs],
         outputs: List[CompletionOutput],
         finished: bool,
+        metrics: Optional[RequestMetrics] = None,
         lora_request: Optional[LoRARequest] = None,
     ) -> None:
         self.request_id = request_id

@@ -79,6 +82,7 @@ class RequestOutput:
         self.prompt_logprobs = prompt_logprobs
         self.outputs = outputs
         self.finished = finished
+        self.metrics = metrics
         self.lora_request = lora_request

     @classmethod

@@ -115,12 +119,15 @@ class RequestOutput:
         prompt_token_ids = seq_group.prompt_token_ids
         prompt_logprobs = seq_group.prompt_logprobs
         finished = seq_group.is_finished()
+        finished_time = time.time() if finished else None
+        seq_group.set_finished_time(finished_time)
         return cls(seq_group.request_id,
                    prompt,
                    prompt_token_ids,
                    prompt_logprobs,
                    outputs,
                    finished,
+                   seq_group.metrics,
                    lora_request=seq_group.lora_request)

     def __repr__(self) -> str:

@@ -130,4 +137,5 @@ class RequestOutput:
                 f"prompt_logprobs={self.prompt_logprobs}, "
                 f"outputs={self.outputs}, "
                 f"finished={self.finished}, "
+                f"metrics={self.metrics}, "
                 f"lora_request={self.lora_request})")
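From the caller's side, every RequestOutput returned by LLM.generate() now carries a .metrics attribute. A usage sketch, assuming vLLM at this commit is installed and that the example model name is available locally:

# Usage sketch (assumes vLLM at this commit; the model name is only an
# example). Each RequestOutput now exposes .metrics next to .outputs.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=16))

for out in outputs:
    m = out.metrics
    print(f"request {out.request_id}: "
          f"arrival_time={m.arrival_time:.3f}, "
          f"time_in_queue={m.time_in_queue}, "
          f"first_token_time={m.first_token_time}, "
          f"finished_time={m.finished_time}")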
vllm/sequence.py

 """Sequence and its related classes."""
 import copy
 import enum
+from dataclasses import dataclass
 from typing import Dict, List, Optional, Union

 from vllm.block import LogicalTokenBlock

@@ -49,6 +50,25 @@ class SequenceStatus(enum.Enum):
         return finish_reason


+@dataclass
+class RequestMetrics:
+    """Metrics associated with a request.
+
+    Args:
+        arrival_time: The time when the request arrived.
+        first_scheduled_time: The time when the request was first scheduled.
+        first_token_time: The time when the first token was generated.
+        time_in_queue: The time the request spent in the queue.
+        finished_time: The time when the request was finished.
+    """
+    arrival_time: float
+    last_token_time: float
+    first_scheduled_time: Optional[float]
+    first_token_time: Optional[float]
+    time_in_queue: Optional[float]
+    finished_time: Optional[float] = None
+
+
 class SequenceData:
     """Data associated with a sequence.

@@ -252,8 +272,11 @@ class SequenceGroup:
         self.request_id = request_id
         self.seqs_dict = {seq.seq_id: seq for seq in seqs}
         self.sampling_params = sampling_params
-        self.arrival_time = arrival_time
-        self.last_token_time = arrival_time
+        self.metrics = RequestMetrics(arrival_time=arrival_time,
+                                      last_token_time=arrival_time,
+                                      first_scheduled_time=None,
+                                      first_token_time=None,
+                                      time_in_queue=None)
         self.lora_request = lora_request
         self.prefix: Optional[Prefix] = prefix
         self.prompt_logprobs: Optional[PromptLogprobs] = None

@@ -276,10 +299,25 @@ class SequenceGroup:
     def get_last_latency(self, now: float) -> float:
         """Gets last token latency for Request level timings."""
-        latency = now - self.last_token_time
-        self.last_token_time = now
+        latency = now - self.metrics.last_token_time
+        self.metrics.last_token_time = now
         return latency

+    def maybe_set_first_token_time(self, time: float) -> None:
+        """Sets the first token time for Request level timings."""
+        if self.metrics.first_token_time is None:
+            self.metrics.first_token_time = time
+
+    def maybe_set_first_scheduled_time(self, time: float) -> None:
+        """Sets the first scheduled time and time in queue for Request level timings."""
+        if self.metrics.first_scheduled_time is None:
+            self.metrics.first_scheduled_time = time
+            self.metrics.time_in_queue = time - self.metrics.arrival_time
+
+    def set_finished_time(self, time: Optional[float]) -> None:
+        """Sets the finished time for Request level timings."""
+        self.metrics.finished_time = time
+
     def get_max_num_running_seqs(self) -> int:
         """The maximum number of sequences running in parallel in the remaining
         lifetime of the request."""
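RequestMetrics is the single record that the scheduler, engine, and output code above all stamp. A lifecycle sketch, assuming vLLM at this commit, that walks the same stages by hand:

# Lifecycle sketch (assumes vLLM at this commit): one RequestMetrics record
# is stamped at each stage; the engine code above writes these same fields.
import time

from vllm.sequence import RequestMetrics

t_arrival = time.time()
metrics = RequestMetrics(arrival_time=t_arrival,
                         last_token_time=t_arrival,
                         first_scheduled_time=None,
                         first_token_time=None,
                         time_in_queue=None)

# Scheduling step (cf. SequenceGroup.maybe_set_first_scheduled_time).
t_sched = time.time()
metrics.first_scheduled_time = t_sched
metrics.time_in_queue = t_sched - metrics.arrival_time

# First decoded token (cf. SequenceGroup.maybe_set_first_token_time).
metrics.first_token_time = time.time()

# Request completion (cf. RequestOutput.from_seq_group -> set_finished_time).
metrics.finished_time = time.time()

print(metrics)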