norm/vllm, commit e3319577 (unverified)
Authored May 10, 2023 by Woosuk Kwon, committed by GitHub
Log system stats (#90)
Parent: 8d66a7b6
Showing 3 changed files with 53 additions and 136 deletions:
- cacheflow/core/scheduler.py (+48, -129)
- cacheflow/core/server.py (+5, -6)
- cacheflow/worker/worker.py (+0, -1)
cacheflow/core/scheduler.py
```diff
@@ -5,6 +5,7 @@ import time
 from typing import Any, Dict, List, Optional, Tuple
 
 from cacheflow.core.block_manager import BlockSpaceManager
+from cacheflow.logger import init_logger
 from cacheflow.core.policy import PolicyFactory
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence
```
```diff
@@ -14,6 +15,10 @@ from cacheflow.sequence import SequenceOutputs
 from cacheflow.sequence import SequenceStatus
 
+logger = init_logger(__name__)
+
+_LOGGING_INTERVAL_SEC = 10
+
 
 class PreemptionMode(enum.Enum):
     """Preemption modes.
```
```diff
@@ -37,8 +42,7 @@ class Scheduler:
         num_cpu_blocks: int,
         max_num_batched_tokens: int,
         max_num_sequences: int,
-        collect_stats: bool,
-        do_memory_analysis: bool = False,
+        log_stats: bool,
     ) -> None:
         self.controllers = controllers
         self.block_size = block_size
```
```diff
@@ -46,8 +50,7 @@ class Scheduler:
         self.num_cpu_blocks = num_cpu_blocks
         self.max_num_batched_tokens = max_num_batched_tokens
         self.max_num_sequences = max_num_sequences
-        self.collect_stats = collect_stats
-        self.do_memory_analysis = do_memory_analysis
+        self.log_stats = log_stats
 
         # Instantiate the scheduling policy.
         self.policy = PolicyFactory.get_policy(policy_name='fcfs')
```
```diff
@@ -69,8 +72,9 @@ class Scheduler:
         # Sequence groups in the SWAPPED state.
         self.swapped: List[SequenceGroup] = []
 
-        # Performance-related statistics.
-        self.stats = Stats(num_gpu_blocks, num_cpu_blocks)
+        self.last_logging_time: float = 0.0
+        # List[timestamp, num_tokens]
+        self.num_input_tokens: List[Tuple[float, int]] = []
 
     def add_sequence_groups(
         self,
```
```diff
@@ -186,59 +190,46 @@ class Scheduler:
             num_batched_tokens += num_prompt_tokens
             prompt_group_ids.append(seq_group.group_id)
 
-        if self.collect_stats:
-            if self.running or blocks_to_swap_in or blocks_to_swap_out:
-                self.stats.timestamps.append(now - self.stats.start_time)
-                self.stats.input_lens.append(num_batched_tokens)
-                self.stats.swap_out_lens.append(
-                    len(blocks_to_swap_out) * self.block_size)
-                self.stats.swap_in_lens.append(
-                    len(blocks_to_swap_in) * self.block_size)
-                self.stats.num_preemption.append(len(preempted))
-                self.stats.num_swapped.append(len(self.swapped))
-                self.stats.num_running.append(len(self.running))
-                self.stats.num_waiting.append(len(self.waiting))
-
-                num_free_gpu_blocks = self.block_manager.get_num_free_gpu_blocks()
-                num_used_gpu_blocks = self.num_gpu_blocks - num_free_gpu_blocks
-                self.stats.gpu_cache_usage.append(
-                    num_used_gpu_blocks / self.num_gpu_blocks)
-                if self.num_cpu_blocks > 0:
-                    num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks()
-                    num_used_cpu_blocks = self.num_cpu_blocks - num_free_cpu_blocks
-                    self.stats.cpu_cache_usage.append(
-                        num_used_cpu_blocks / self.num_cpu_blocks)
-
-                if self.do_memory_analysis:
-                    block_tables = self.block_manager.block_tables
-                    num_logical_blocks = 0
-                    num_logical_tokens = 0
-                    num_physical_blocks = 0
-                    num_physical_tokens = 0
-                    physical_block_numbers = set()
-                    num_reserved_tokens = 0
-                    for seq_group in self.running:
-                        group_id = seq_group.group_id
-                        sampling_params = self.sampling_params[group_id]
-                        max_num_steps = sampling_params.max_num_steps
-                        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-                            num_logical_blocks += len(seq.logical_token_blocks)
-                            num_logical_tokens += seq.get_len()
-
-                            seq_id = seq.seq_id
-                            block_table = block_tables[seq_id]
-                            for i, block in enumerate(block_table):
-                                if block.block_number in physical_block_numbers:
-                                    continue
-                                physical_block_numbers.add(block.block_number)
-                                num_physical_blocks += 1
-                                num_physical_tokens += seq.logical_token_blocks[i].num_tokens
-
-                    assert num_physical_blocks == num_used_gpu_blocks
-                    self.stats.num_logical_blocks.append(num_logical_blocks)
-                    self.stats.num_logical_tokens.append(num_logical_tokens)
-                    self.stats.num_physical_blocks.append(num_physical_blocks)
-                    self.stats.num_physical_tokens.append(num_physical_tokens)
-                    self.stats.num_reserved_tokens.append(num_reserved_tokens)
-
-        return (blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy,
-                prompt_group_ids)
+        if not self.log_stats:
+            return (blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy,
+                    prompt_group_ids)
+
+        now = time.time()
+        if num_batched_tokens > 0:
+            self.num_input_tokens.append((now, num_batched_tokens))
+        elapsed_time = now - self.last_logging_time
+        if elapsed_time > _LOGGING_INTERVAL_SEC:
+            self.last_logging_time = now
+            self.num_input_tokens = [(t, n) for t, n in self.num_input_tokens
+                                     if now - t < _LOGGING_INTERVAL_SEC]
+            if len(self.num_input_tokens) > 1:
+                total_num_tokens = sum(n for _, n in self.num_input_tokens[:-1])
+                window = now - self.num_input_tokens[0][0]
+                avg_throughput = total_num_tokens / window
+            else:
+                avg_throughput = 0.0
+
+            num_free_gpu_blocks = self.block_manager.get_num_free_gpu_blocks()
+            num_used_gpu_blocks = self.num_gpu_blocks - num_free_gpu_blocks
+            gpu_cache_usage = num_used_gpu_blocks / self.num_gpu_blocks
+            if self.num_cpu_blocks > 0:
+                num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks()
+                num_used_cpu_blocks = self.num_cpu_blocks - num_free_cpu_blocks
+                cpu_cache_usage = num_used_cpu_blocks / self.num_cpu_blocks
+            else:
+                cpu_cache_usage = 0.0
+
+            logger.info(
+                f"Throughput: {avg_throughput:.1f} tokens/s, "
+                f"Running: {len(self.running)} reqs, "
+                f"Swapped: {len(self.swapped)} reqs, "
+                f"Pending: {len(self.waiting)} reqs, "
+                f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, "
+                f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%")
+
+        return (blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy,
+                prompt_group_ids)
 
     def step(self) -> List[SequenceGroup]:
```
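The logged throughput is a trailing-window average: the scheduler records one (timestamp, num_tokens) pair per scheduling step, discards pairs older than `_LOGGING_INTERVAL_SEC` at logging time, and divides the summed token counts (excluding the newest sample) by the age of the oldest remaining sample. Below is a minimal standalone sketch of that calculation; the constant and variable names follow the diff, while the helper function and the sample trace are made up for illustration:

```python
import time
from typing import List, Tuple

_LOGGING_INTERVAL_SEC = 10  # same constant as in the diff


def windowed_throughput(num_input_tokens: List[Tuple[float, int]],
                        now: float) -> float:
    """Average tokens/s over the trailing logging interval."""
    # Keep only samples that still fall inside the window.
    recent = [(t, n) for t, n in num_input_tokens
              if now - t < _LOGGING_INTERVAL_SEC]
    if len(recent) > 1:
        # The newest sample is excluded from the numerator, mirroring the
        # [:-1] slice in the diff: no time has elapsed for it yet, so
        # counting it would overstate the rate.
        total_num_tokens = sum(n for _, n in recent[:-1])
        window = now - recent[0][0]
        return total_num_tokens / window
    return 0.0


# Made-up trace: 256 tokens scheduled once per second for 5 seconds.
now = time.time()
samples = [(now - 5.0 + i, 256) for i in range(5)]
print(f"Throughput: {windowed_throughput(samples, now):.1f} tokens/s")  # ~204.8
```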
```diff
@@ -455,75 +446,3 @@ class Scheduler:
             blocks_to_swap_out.update(mapping)
         for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
             seq.status = SequenceStatus.SWAPPED
-
-    def reset_stats(self) -> None:
-        self.stats.reset(self.num_gpu_blocks, self.num_cpu_blocks)
-
-    def save_stats(
-        self,
-        output_dir: str,
-    ) -> None:
-        assert self.collect_stats, 'Statistics collection is disabled.'
-        self.stats.save(output_dir)
-
-
-class Stats:
-
-    def __init__(
-        self,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-    ) -> None:
-        self.start_time: float = time.time()
-        self.num_gpu_blocks = num_gpu_blocks
-        self.num_cpu_blocks = num_cpu_blocks
-
-        self.timestamps: List[float] = []
-        self.input_lens: List[int] = []
-        self.swap_out_lens: List[int] = []
-        self.swap_in_lens: List[int] = []
-        self.num_preemption: List[int] = []
-        self.num_waiting: List[int] = []
-        self.num_running: List[int] = []
-        self.num_swapped: List[int] = []
-        self.gpu_cache_usage: List[float] = []
-        self.cpu_cache_usage: List[float] = []
-
-        self.num_logical_blocks: List[int] = []
-        self.num_logical_tokens: List[int] = []
-        self.num_physical_blocks: List[int] = []
-        self.num_physical_tokens: List[int] = []
-        self.num_reserved_tokens: List[int] = []
-
-    def reset(
-        self,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-    ) -> None:
-        self.__init__(num_gpu_blocks, num_cpu_blocks)
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {
-            'start_time': self.start_time,
-            'num_gpu_blocks': self.num_gpu_blocks,
-            'num_cpu_blocks': self.num_cpu_blocks,
-            'timestamps': self.timestamps,
-            'input_lens': self.input_lens,
-            'swap_out_lens': self.swap_out_lens,
-            'swap_in_lens': self.swap_in_lens,
-            'num_preemption': self.num_preemption,
-            'num_waiting': self.num_waiting,
-            'num_running': self.num_running,
-            'num_swapped': self.num_swapped,
-            'gpu_cache_usage': self.gpu_cache_usage,
-            'cpu_cache_usage': self.cpu_cache_usage,
-            'num_logical_blocks': self.num_logical_blocks,
-            'num_logical_tokens': self.num_logical_tokens,
-            'num_physical_blocks': self.num_physical_blocks,
-            'num_physical_tokens': self.num_physical_tokens,
-            'num_reserved_tokens': self.num_reserved_tokens,
-        }
-
-    def save(self, output_dir: str) -> None:
-        with open(os.path.join(output_dir, 'stats.pkl'), 'wb') as f:
-            pickle.dump(self.to_dict(), f)
```
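The removed `Stats` class pickled the dict returned by `to_dict()` into `stats.pkl`, so traces written before this commit remain readable with plain `pickle`. A minimal sketch, assuming a `stats.pkl` produced by the old `save_stats` path (the file location is hypothetical):

```python
import pickle

# Hypothetical path: wherever save_stats(output_dir) wrote its trace.
with open('stats.pkl', 'rb') as f:
    stats = pickle.load(f)

# Keys follow the removed to_dict(): 'timestamps', 'input_lens',
# 'gpu_cache_usage', 'num_preemption', and so on.
print(stats['num_gpu_blocks'], len(stats['timestamps']))
```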
cacheflow/core/server.py
```diff
@@ -44,18 +44,16 @@ class Server:
         gpu_memory: int,
         cpu_memory: int,
         use_ray: bool,
-        collect_stats: bool = False,
-        do_memory_analysis: bool = False,
+        log_stats: bool,
     ):
         logger.info(
             "Initializing a server with config: "
             f"model={model!r}, "
             f"dtype={dtype}, "
             f"use_dummy_weights={use_dummy_weights}, "
-            f"cache_dir={cache_dir}, "
+            f"cache_dir={cache_dir!r}, "
             f"use_np_cache={use_np_cache}, "
             f"tensor_parallel_size={tensor_parallel_size}, "
             f"block_size={block_size}, "
             f"seed={seed})")
         self.num_nodes = num_nodes
```
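Besides swapping `collect_stats`/`do_memory_analysis` for `log_stats`, the hunk above changes the `cache_dir` field to use `!r`, which formats the value with `repr()`: strings come out quoted and `None` prints unambiguously. A quick illustration:

```python
cache_dir = '/tmp/models'
print(f"cache_dir={cache_dir}")    # cache_dir=/tmp/models
print(f"cache_dir={cache_dir!r}")  # cache_dir='/tmp/models'

cache_dir = None
print(f"cache_dir={cache_dir!r}")  # cache_dir=None
```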
```diff
@@ -111,8 +109,7 @@ class Server:
             num_cpu_blocks=self.num_cpu_blocks,
             max_num_batched_tokens=max_num_batched_tokens,
             max_num_sequences=max_num_sequences,
-            collect_stats=collect_stats,
-            do_memory_analysis=do_memory_analysis,
+            log_stats=log_stats,
         )
         # Connect the controllers.
         for i in range(len(self.controllers) - 1):
```
```diff
@@ -244,6 +241,7 @@ def add_server_arguments(parser: argparse.ArgumentParser):
     parser.add_argument('--swap-space', type=int, default=20,
                         help='CPU swap space size (GiB) per GPU')
     parser.add_argument('--max-num-batched-tokens', type=int, default=2560,
                         help='maximum number of batched tokens per iteration')
     parser.add_argument('--max-num-sequences', type=int, default=256,
                         help='maximum number of sequences per iteration')
+    parser.add_argument('--log-stats', action='store_true', help='log system statistics')
     return parser
```
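The new `--log-stats` option is a `store_true` flag: it defaults to `False` and becomes `True` only when passed on the command line, and the `Server` forwards it down to the `Scheduler`. A self-contained sketch of the flag's behavior in isolation:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--log-stats', action='store_true',
                    help='log system statistics')

assert parser.parse_args([]).log_stats is False
assert parser.parse_args(['--log-stats']).log_stats is True
```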
```diff
@@ -286,6 +284,7 @@ def init_local_server_and_frontend_with_arguments(args: argparse.Namespace):
         gpu_memory=get_gpu_memory(),
         cpu_memory=get_cpu_memory(),
         use_ray=args.use_ray,
+        log_stats=args.log_stats,
     )
     # Create a frontend.
```
cacheflow/worker/worker.py
```diff
@@ -91,7 +91,6 @@ class Worker:
         initialize_model_parallel(tensor_parallel_size,
                                   pipeline_parallel_size)
 
-
     def prepare_inputs(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
```