norm / vllm · Commits · e3319577

Unverified commit e3319577, authored May 10, 2023 by Woosuk Kwon, committed by GitHub on May 10, 2023

Log system stats (#90)

Parent: 8d66a7b6
Showing 3 changed files with 53 additions and 136 deletions:

  cacheflow/core/scheduler.py   +48  -129
  cacheflow/core/server.py       +5    -6
  cacheflow/worker/worker.py     +0    -1
cacheflow/core/scheduler.py

@@ -5,6 +5,7 @@ import time
 from typing import Any, Dict, List, Optional, Tuple
 
 from cacheflow.core.block_manager import BlockSpaceManager
+from cacheflow.logger import init_logger
 from cacheflow.core.policy import PolicyFactory
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence
@@ -14,6 +15,10 @@ from cacheflow.sequence import SequenceOutputs
 from cacheflow.sequence import SequenceStatus
 
+logger = init_logger(__name__)
+
+_LOGGING_INTERVAL_SEC = 10
+
 
 class PreemptionMode(enum.Enum):
     """Preemption modes.
@@ -37,8 +42,7 @@ class Scheduler:
         num_cpu_blocks: int,
         max_num_batched_tokens: int,
         max_num_sequences: int,
-        collect_stats: bool,
-        do_memory_analysis: bool = False,
+        log_stats: bool,
     ) -> None:
         self.controllers = controllers
         self.block_size = block_size
@@ -46,8 +50,7 @@ class Scheduler:
         self.num_cpu_blocks = num_cpu_blocks
         self.max_num_batched_tokens = max_num_batched_tokens
         self.max_num_sequences = max_num_sequences
-        self.collect_stats = collect_stats
-        self.do_memory_analysis = do_memory_analysis
+        self.log_stats = log_stats
 
         # Instantiate the scheduling policy.
         self.policy = PolicyFactory.get_policy(policy_name='fcfs')
@@ -69,8 +72,9 @@ class Scheduler:
         # Sequence groups in the SWAPPED state.
         self.swapped: List[SequenceGroup] = []
 
-        # Performance-related statistics.
-        self.stats = Stats(num_gpu_blocks, num_cpu_blocks)
+        self.last_logging_time: float = 0.0
+        # List[timestamp, num_tokens]
+        self.num_input_tokens: List[Tuple[float, int]] = []
 
     def add_sequence_groups(
         self,
@@ -186,59 +190,46 @@ class Scheduler:
             num_batched_tokens += num_prompt_tokens
             prompt_group_ids.append(seq_group.group_id)
 
-        if self.collect_stats:
-            if self.running or blocks_to_swap_in or blocks_to_swap_out:
-                self.stats.timestamps.append(now - self.stats.start_time)
-                self.stats.input_lens.append(num_batched_tokens)
-                self.stats.swap_out_lens.append(len(blocks_to_swap_out) * self.block_size)
-                self.stats.swap_in_lens.append(len(blocks_to_swap_in) * self.block_size)
-                self.stats.num_preemption.append(len(preempted))
-                self.stats.num_swapped.append(len(self.swapped))
-                self.stats.num_running.append(len(self.running))
-                self.stats.num_waiting.append(len(self.waiting))
-
-                num_free_gpu_blocks = self.block_manager.get_num_free_gpu_blocks()
-                num_used_gpu_blocks = self.num_gpu_blocks - num_free_gpu_blocks
-                self.stats.gpu_cache_usage.append(num_used_gpu_blocks / self.num_gpu_blocks)
-                num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks()
-                num_used_cpu_blocks = self.num_cpu_blocks - num_free_cpu_blocks
-                self.stats.cpu_cache_usage.append(num_used_cpu_blocks / self.num_cpu_blocks)
-
-                if self.do_memory_analysis:
-                    block_tables = self.block_manager.block_tables
-                    num_logical_blocks = 0
-                    num_logical_tokens = 0
-                    num_physical_blocks = 0
-                    num_physical_tokens = 0
-                    physical_block_numbers = set()
-                    num_reserved_tokens = 0
-                    for seq_group in self.running:
-                        group_id = seq_group.group_id
-                        sampling_params = self.sampling_params[group_id]
-                        max_num_steps = sampling_params.max_num_steps
-                        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-                            num_logical_blocks += len(seq.logical_token_blocks)
-                            num_logical_tokens += seq.get_len()
-                            seq_id = seq.seq_id
-                            block_table = block_tables[seq_id]
-                            for i, block in enumerate(block_table):
-                                if block.block_number in physical_block_numbers:
-                                    continue
-                                physical_block_numbers.add(block.block_number)
-                                num_physical_blocks += 1
-                                num_physical_tokens += seq.logical_token_blocks[i].num_tokens
-                    assert num_physical_blocks == num_used_gpu_blocks
-                    self.stats.num_logical_blocks.append(num_logical_blocks)
-                    self.stats.num_logical_tokens.append(num_logical_tokens)
-                    self.stats.num_physical_blocks.append(num_physical_blocks)
-                    self.stats.num_physical_tokens.append(num_physical_tokens)
-                    self.stats.num_reserved_tokens.append(num_reserved_tokens)
+        if not self.log_stats:
+            return (blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy,
+                    prompt_group_ids)
+
+        now = time.time()
+        if num_batched_tokens > 0:
+            self.num_input_tokens.append((now, num_batched_tokens))
+        elapsed_time = now - self.last_logging_time
+        if elapsed_time > _LOGGING_INTERVAL_SEC:
+            self.last_logging_time = now
+            self.num_input_tokens = [
+                (t, n) for t, n in self.num_input_tokens
+                if now - t < _LOGGING_INTERVAL_SEC
+            ]
+            if len(self.num_input_tokens) > 1:
+                total_num_tokens = sum(n for _, n in self.num_input_tokens[:-1])
+                window = now - self.num_input_tokens[0][0]
+                avg_throughput = total_num_tokens / window
+            else:
+                avg_throughput = 0.0
+
+            num_free_gpu_blocks = self.block_manager.get_num_free_gpu_blocks()
+            num_used_gpu_blocks = self.num_gpu_blocks - num_free_gpu_blocks
+            gpu_cache_usage = num_used_gpu_blocks / self.num_gpu_blocks
+            if self.num_cpu_blocks > 0:
+                num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks()
+                num_used_cpu_blocks = self.num_cpu_blocks - num_free_cpu_blocks
+                cpu_cache_usage = num_used_cpu_blocks / self.num_cpu_blocks
+            else:
+                cpu_cache_usage = 0.0
+
+            logger.info(
+                f"Throughput: {avg_throughput:.1f} tokens/s, "
+                f"Running: {len(self.running)} reqs, "
+                f"Swapped: {len(self.swapped)} reqs, "
+                f"Pending: {len(self.waiting)} reqs, "
+                f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, "
+                f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%")
 
         return (blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy,
                 prompt_group_ids)
 
     def step(self) -> List[SequenceGroup]:
@@ -455,75 +446,3 @@ class Scheduler:
                 blocks_to_swap_out.update(mapping)
             for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
                 seq.status = SequenceStatus.SWAPPED
-
-    def reset_stats(self) -> None:
-        self.stats.reset(self.num_gpu_blocks, self.num_cpu_blocks)
-
-    def save_stats(
-        self,
-        output_dir: str,
-    ) -> None:
-        assert self.collect_stats, 'Statistics collection is disabled.'
-        self.stats.save(output_dir)
-
-
-class Stats:
-
-    def __init__(
-        self,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-    ) -> None:
-        self.start_time: float = time.time()
-        self.num_gpu_blocks = num_gpu_blocks
-        self.num_cpu_blocks = num_cpu_blocks
-
-        self.timestamps: List[float] = []
-        self.input_lens: List[int] = []
-        self.swap_out_lens: List[int] = []
-        self.swap_in_lens: List[int] = []
-        self.num_preemption: List[int] = []
-        self.num_waiting: List[int] = []
-        self.num_running: List[int] = []
-        self.num_swapped: List[int] = []
-        self.gpu_cache_usage: List[float] = []
-        self.cpu_cache_usage: List[float] = []
-
-        self.num_logical_blocks: List[int] = []
-        self.num_logical_tokens: List[int] = []
-        self.num_physical_blocks: List[int] = []
-        self.num_physical_tokens: List[int] = []
-        self.num_reserved_tokens: List[int] = []
-
-    def reset(
-        self,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-    ) -> None:
-        self.__init__(num_gpu_blocks, num_cpu_blocks)
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {
-            'start_time': self.start_time,
-            'num_gpu_blocks': self.num_gpu_blocks,
-            'num_cpu_blocks': self.num_cpu_blocks,
-            'timestamps': self.timestamps,
-            'input_lens': self.input_lens,
-            'swap_out_lens': self.swap_out_lens,
-            'swap_in_lens': self.swap_in_lens,
-            'num_preemption': self.num_preemption,
-            'num_waiting': self.num_waiting,
-            'num_running': self.num_running,
-            'num_swapped': self.num_swapped,
-            'gpu_cache_usage': self.gpu_cache_usage,
-            'cpu_cache_usage': self.cpu_cache_usage,
-            'num_logical_blocks': self.num_logical_blocks,
-            'num_logical_tokens': self.num_logical_tokens,
-            'num_physical_blocks': self.num_physical_blocks,
-            'num_physical_tokens': self.num_physical_tokens,
-            'num_reserved_tokens': self.num_reserved_tokens,
-        }
-
-    def save(self, output_dir: str) -> None:
-        with open(os.path.join(output_dir, 'stats.pkl'), 'wb') as f:
-            pickle.dump(self.to_dict(), f)
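
Note: in short, this hunk replaces the offline Stats collector (pickled via save_stats) with periodic logger.info reports. Each _schedule() call records a (timestamp, num_tokens) sample, and once _LOGGING_INTERVAL_SEC has elapsed the sample window is pruned and a running average is reported. Below is a minimal, standalone sketch of that sliding-window technique; the ThroughputLogger class and the print() call are illustrative stand-ins, not cacheflow API:

    import time
    from typing import List, Tuple

    _LOGGING_INTERVAL_SEC = 10


    class ThroughputLogger:

        def __init__(self) -> None:
            self.last_logging_time: float = 0.0
            # (timestamp, num_tokens) samples, like Scheduler.num_input_tokens.
            self.num_input_tokens: List[Tuple[float, int]] = []

        def record(self, num_batched_tokens: int) -> None:
            now = time.time()
            if num_batched_tokens > 0:
                self.num_input_tokens.append((now, num_batched_tokens))
            if now - self.last_logging_time <= _LOGGING_INTERVAL_SEC:
                return
            self.last_logging_time = now
            # Drop samples that have fallen out of the logging window.
            self.num_input_tokens = [
                (t, n) for t, n in self.num_input_tokens
                if now - t < _LOGGING_INTERVAL_SEC
            ]
            if len(self.num_input_tokens) > 1:
                # Exclude the newest sample, mirroring the scheduler code above.
                total_num_tokens = sum(n for _, n in self.num_input_tokens[:-1])
                window = now - self.num_input_tokens[0][0]
                avg_throughput = total_num_tokens / window
            else:
                avg_throughput = 0.0
            print(f"Throughput: {avg_throughput:.1f} tokens/s")

Because only samples newer than the interval are kept, the reported throughput reflects recent load rather than a lifetime average.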
cacheflow/core/server.py

@@ -44,18 +44,16 @@ class Server:
         gpu_memory: int,
         cpu_memory: int,
         use_ray: bool,
-        collect_stats: bool = False,
-        do_memory_analysis: bool = False,
+        log_stats: bool,
     ):
         logger.info(
             "Initializing a server with config: "
             f"model={model!r}, "
             f"dtype={dtype}, "
             f"use_dummy_weights={use_dummy_weights}, "
-            f"cache_dir={cache_dir}, "
+            f"cache_dir={cache_dir!r}, "
             f"use_np_cache={use_np_cache}, "
             f"tensor_parallel_size={tensor_parallel_size}, "
             f"block_size={block_size}, "
             f"seed={seed})"
         )
         self.num_nodes = num_nodes
@@ -111,8 +109,7 @@ class Server:
             num_cpu_blocks=self.num_cpu_blocks,
             max_num_batched_tokens=max_num_batched_tokens,
             max_num_sequences=max_num_sequences,
-            collect_stats=collect_stats,
-            do_memory_analysis=do_memory_analysis,
+            log_stats=log_stats,
         )
         # Connect the controllers.
         for i in range(len(self.controllers) - 1):
@@ -244,6 +241,7 @@ def add_server_arguments(parser: argparse.ArgumentParser):
     parser.add_argument('--swap-space', type=int, default=20, help='CPU swap space size (GiB) per GPU')
     parser.add_argument('--max-num-batched-tokens', type=int, default=2560, help='maximum number of batched tokens per iteration')
     parser.add_argument('--max-num-sequences', type=int, default=256, help='maximum number of sequences per iteration')
+    parser.add_argument('--log-stats', action='store_true', help='log system statistics')
     return parser
@@ -286,6 +284,7 @@ def init_local_server_and_frontend_with_arguments(args: argparse.Namespace):
         gpu_memory=get_gpu_memory(),
         cpu_memory=get_cpu_memory(),
         use_ray=args.use_ray,
        log_stats=args.log_stats,
     )
     # Create a frontend.
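
Note: because --log-stats is a store_true flag, stats logging becomes an explicit command-line opt-in instead of a pair of constructor defaults. A hedged sketch of how the flag flows through add_server_arguments; only the function's location and its 'return parser' behavior are taken from the diff, and the parser description is made up for the example:

    import argparse

    from cacheflow.core.server import add_server_arguments

    parser = argparse.ArgumentParser(description='demo')  # description is illustrative
    parser = add_server_arguments(parser)

    args = parser.parse_args(['--log-stats'])
    assert args.log_stats                         # True when the flag is given
    assert not parser.parse_args([]).log_stats    # store_true defaults to False
    # The Server and Scheduler then receive log_stats=args.log_stats, as in the hunks above.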
cacheflow/worker/worker.py

@@ -91,7 +91,6 @@ class Worker:
         initialize_model_parallel(tensor_parallel_size,
                                   pipeline_parallel_size)
 
-
     def prepare_inputs(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],