Commit 017d9f15 (unverified)
Authored Feb 20, 2024 by Antoni Baum; committed by GitHub on Feb 20, 2024
Parent: 181b27d8

Add metrics to RequestOutput (#2876)
Showing 6 changed files with 61 additions and 9 deletions:

    tests/async_engine/test_request_tracker.py   +1  -1
    vllm/core/policy.py                           +1  -1
    vllm/core/scheduler.py                        +3  -0
    vllm/engine/llm_engine.py                     +5  -2
    vllm/outputs.py                               +9  -1
    vllm/sequence.py                              +42 -4
tests/async_engine/test_request_tracker.py
@@ -64,7 +64,7 @@ def test_request_tracker():
     stream_5 = tracker.add_request("5")
     assert tracker.new_requests_event.flag
     tracker.process_request_output(
-        RequestOutput("2", "output", [], [], [], finished=True))
+        RequestOutput("2", "output", [], [], [], bool(finished)))
     new, finished = tracker.get_new_and_finished_requests()
     assert not tracker.new_requests_event.flag
     assert len(finished) == 1
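The updated call reflects that RequestOutput now takes an optional metrics argument (added in vllm/outputs.py below) after finished; it defaults to None, so existing call sites keep working. A minimal sketch, assuming nothing beyond the signature in this commit:

    from vllm.outputs import RequestOutput

    # Same shape as the test fixture above; metrics is omitted, so the
    # attribute defaults to None rather than a RequestMetrics instance.
    output = RequestOutput("2", "output", [], [], [], finished=True)
    assert output.metrics is None
    assert output.finished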
vllm/core/policy.py
@@ -33,7 +33,7 @@ class FCFS(Policy):
         now: float,
         seq_group: SequenceGroup,
     ) -> float:
-        return now - seq_group.arrival_time
+        return now - seq_group.metrics.arrival_time
 
 
 class PolicyFactory:
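FCFS priority is unchanged in spirit: the longer a request has waited, the higher its priority. Only the attribute lookup moved, from seq_group.arrival_time to seq_group.metrics.arrival_time. A standalone sketch with stand-in classes (FakeMetrics and FakeSeqGroup are illustrative, not vllm types) shows the resulting ordering:

    import time
    from dataclasses import dataclass

    @dataclass
    class FakeMetrics:          # stand-in for vllm.sequence.RequestMetrics
        arrival_time: float

    @dataclass
    class FakeSeqGroup:         # stand-in for vllm.sequence.SequenceGroup
        request_id: str
        metrics: FakeMetrics

    def fcfs_priority(now: float, seq_group: FakeSeqGroup) -> float:
        # Mirrors FCFS.get_priority after this commit: age is priority.
        return now - seq_group.metrics.arrival_time

    now = time.time()
    groups = [FakeSeqGroup("young", FakeMetrics(now - 1.0)),
              FakeSeqGroup("old", FakeMetrics(now - 9.0))]
    groups.sort(key=lambda g: fcfs_priority(now, g), reverse=True)
    assert [g.request_id for g in groups] == ["old", "young"]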
vllm/core/scheduler.py
@@ -365,10 +365,13 @@ class Scheduler:
         # This function call changes the internal states of the scheduler
         # such as self.running, self.swapped, and self.waiting.
         scheduler_outputs = self._schedule()
+        now = time.time()
 
         # Create input data structures.
         seq_group_metadata_list: List[SequenceGroupMetadata] = []
         for seq_group in scheduler_outputs.scheduled_seq_groups:
+            seq_group.maybe_set_first_scheduled_time(now)
+
             seq_data: Dict[int, SequenceData] = {}
             block_tables: Dict[int, List[int]] = {}
             for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
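The scheduler now timestamps each sequence group the first time it is scheduled, which is what makes time_in_queue well defined. The guard inside maybe_set_first_scheduled_time makes later calls no-ops, so rescheduling a running group does not move the timestamp. A standalone sketch of that set-once behavior (simplified stand-ins, not vllm imports):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Metrics:              # illustrative subset of RequestMetrics
        arrival_time: float
        first_scheduled_time: Optional[float] = None
        time_in_queue: Optional[float] = None

    def maybe_set_first_scheduled_time(m: Metrics, now: float) -> None:
        # Only the first scheduling event counts; repeats are ignored.
        if m.first_scheduled_time is None:
            m.first_scheduled_time = now
            m.time_in_queue = now - m.arrival_time

    m = Metrics(arrival_time=10.0)
    maybe_set_first_scheduled_time(m, 10.5)   # first schedule: recorded
    maybe_set_first_scheduled_time(m, 99.0)   # later iteration: no-op
    assert (m.first_scheduled_time, m.time_in_queue) == (10.5, 0.5)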
vllm/engine/llm_engine.py
@@ -728,6 +728,7 @@ class LLMEngine:
     def _process_model_outputs(
             self, output: SamplerOutput,
             scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
+        now = time.time()
         # Update the scheduled sequence groups with the model outputs.
         scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
         for seq_group, outputs in zip(scheduled_seq_groups, output):

@@ -739,6 +740,7 @@ class LLMEngine:
         # Create the outputs.
         request_outputs: List[RequestOutput] = []
         for seq_group in scheduled_seq_groups:
+            seq_group.maybe_set_first_token_time(now)
             request_output = RequestOutput.from_seq_group(seq_group)
             request_outputs.append(request_output)
         for seq_group in scheduler_outputs.ignored_seq_groups:

@@ -876,11 +878,12 @@ class LLMEngine:
         # Latency Timings.
         time_last_iters = []
         for seq_group in scheduler_outputs.scheduled_seq_groups:
-            # Time since last token. (n.b. updates seq_group.last_token_time)
+            # Time since last token.
+            # (n.b. updates seq_group.metrics.last_token_time)
             time_last_iters.append(seq_group.get_last_latency(now))
             # Time since arrival for all finished requests.
             if seq_group.is_finished():
-                time_e2e_requests.append(now - seq_group.arrival_time)
+                time_e2e_requests.append(now - seq_group.metrics.arrival_time)
 
         time_to_first_tokens = time_last_iters if prompt_run else []
         time_per_output_tokens = [] if prompt_run else time_last_iters
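The logging change works because get_last_latency is dual-purpose: on a prompt run the gap since arrival is the time to first token, and on decode runs each gap is a per-output-token latency. A sketch with illustrative timestamps (the _Timing class is a stand-in, not vllm code):

    class _Timing:
        """Stand-in for SequenceGroup's last-token bookkeeping."""

        def __init__(self, arrival_time: float) -> None:
            self.last_token_time = arrival_time

        def get_last_latency(self, now: float) -> float:
            # Same shape as SequenceGroup.get_last_latency below:
            # report the gap since the previous token, then advance.
            latency = now - self.last_token_time
            self.last_token_time = now
            return latency

    t = _Timing(arrival_time=0.0)
    ttft = t.get_last_latency(0.5)    # prompt run -> time_to_first_tokens
    tpot = t.get_last_latency(0.75)   # decode run -> time_per_output_tokens
    assert (ttft, tpot) == (0.5, 0.25)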
vllm/outputs.py
 from typing import List, Optional
+import time
 
 from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup,
-                           SequenceStatus)
+                           SequenceStatus, RequestMetrics)
 from vllm.lora.request import LoRARequest

@@ -60,6 +61,7 @@ class RequestOutput:
         prompt_logprobs: The log probabilities to return per prompt token.
         outputs: The output sequences of the request.
         finished: Whether the whole request is finished.
+        metrics: Metrics associated with the request.
         lora_request: The LoRA request that was used to generate the output.
     """

@@ -71,6 +73,7 @@ class RequestOutput:
         prompt_logprobs: Optional[PromptLogprobs],
         outputs: List[CompletionOutput],
         finished: bool,
+        metrics: Optional[RequestMetrics] = None,
         lora_request: Optional[LoRARequest] = None,
     ) -> None:
         self.request_id = request_id

@@ -79,6 +82,7 @@ class RequestOutput:
         self.prompt_logprobs = prompt_logprobs
         self.outputs = outputs
         self.finished = finished
+        self.metrics = metrics
         self.lora_request = lora_request
 
     @classmethod

@@ -115,12 +119,15 @@ class RequestOutput:
         prompt_token_ids = seq_group.prompt_token_ids
         prompt_logprobs = seq_group.prompt_logprobs
         finished = seq_group.is_finished()
+        finished_time = time.time() if finished else None
+        seq_group.set_finished_time(finished_time)
         return cls(seq_group.request_id,
                    prompt,
                    prompt_token_ids,
                    prompt_logprobs,
                    outputs,
                    finished,
+                   seq_group.metrics,
                    lora_request=seq_group.lora_request)
 
     def __repr__(self) -> str:

@@ -130,4 +137,5 @@ class RequestOutput:
                 f"prompt_logprobs={self.prompt_logprobs}, "
                 f"outputs={self.outputs}, "
                 f"finished={self.finished}, "
+                f"metrics={self.metrics}, "
                 f"lora_request={self.lora_request})")
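From the user's side, the net effect of this file is that every RequestOutput returned by the engine now carries the RequestMetrics of its sequence group. A hedged usage sketch (model name and prompt are placeholders; fields other than arrival_time and last_token_time can be None if the corresponding event has not happened yet):

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")    # placeholder model choice
    out = llm.generate(["Hello"], SamplingParams(max_tokens=8))[0]

    m = out.metrics
    if m is not None and m.finished_time is not None:
        print(f"time_in_queue: {m.time_in_queue}")
        print(f"e2e latency:   {m.finished_time - m.arrival_time:.3f}s")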
vllm/sequence.py
 """Sequence and its related classes."""
 import copy
 import enum
+from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
 
 from vllm.block import LogicalTokenBlock

@@ -49,6 +50,25 @@ class SequenceStatus(enum.Enum):
         return finish_reason
 
 
+@dataclass
+class RequestMetrics:
+    """Metrics associated with a request.
+
+    Args:
+        arrival_time: The time when the request arrived.
+        first_scheduled_time: The time when the request was first scheduled.
+        first_token_time: The time when the first token was generated.
+        time_in_queue: The time the request spent in the queue.
+        finished_time: The time when the request was finished.
+    """
+    arrival_time: float
+    last_token_time: float
+    first_scheduled_time: Optional[float]
+    first_token_time: Optional[float]
+    time_in_queue: Optional[float]
+    finished_time: Optional[float] = None
+
+
 class SequenceData:
     """Data associated with a sequence.

@@ -252,8 +272,11 @@ class SequenceGroup:
         self.request_id = request_id
         self.seqs_dict = {seq.seq_id: seq for seq in seqs}
         self.sampling_params = sampling_params
-        self.arrival_time = arrival_time
-        self.last_token_time = arrival_time
+        self.metrics = RequestMetrics(arrival_time=arrival_time,
+                                      last_token_time=arrival_time,
+                                      first_scheduled_time=None,
+                                      first_token_time=None,
+                                      time_in_queue=None)
         self.lora_request = lora_request
         self.prefix: Optional[Prefix] = prefix
         self.prompt_logprobs: Optional[PromptLogprobs] = None

@@ -276,10 +299,25 @@ class SequenceGroup:
 
     def get_last_latency(self, now: float) -> float:
         """Gets last token latency for Request level timings."""
-        latency = now - self.last_token_time
-        self.last_token_time = now
+        latency = now - self.metrics.last_token_time
+        self.metrics.last_token_time = now
         return latency
 
+    def maybe_set_first_token_time(self, time: float) -> None:
+        """Sets the first token time for Request level timings."""
+        if self.metrics.first_token_time is None:
+            self.metrics.first_token_time = time
+
+    def maybe_set_first_scheduled_time(self, time: float) -> None:
+        """Sets the first scheduled time and time in queue for Request level timings."""
+        if self.metrics.first_scheduled_time is None:
+            self.metrics.first_scheduled_time = time
+            self.metrics.time_in_queue = time - self.metrics.arrival_time
+
+    def set_finished_time(self, time: Optional[float]) -> None:
+        """Sets the finished time for Request level timings."""
+        self.metrics.finished_time = time
+
     def get_max_num_running_seqs(self) -> int:
         """The maximum number of sequences running in parallel in the remaining
         lifetime of the request."""
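Taken together, the dataclass and the three setters define a simple request lifecycle: arrival -> first schedule -> first token -> finish. A walk-through with illustrative timestamps, assigning fields directly where the engine would call the helper methods named in the comments:

    from vllm.sequence import RequestMetrics

    # t=100.0: request arrives (RequestMetrics built in SequenceGroup.__init__).
    m = RequestMetrics(arrival_time=100.0,
                       last_token_time=100.0,
                       first_scheduled_time=None,
                       first_token_time=None,
                       time_in_queue=None)
    # t=100.4: Scheduler.schedule() -> maybe_set_first_scheduled_time.
    m.first_scheduled_time = 100.4
    m.time_in_queue = 100.4 - 100.0
    # t=100.9: first decode output -> maybe_set_first_token_time.
    m.first_token_time = 100.9
    # t=102.5: RequestOutput.from_seq_group -> set_finished_time.
    m.finished_time = 102.5
    assert round(m.time_in_queue, 6) == 0.4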