xdb4_94051 / vllm · commit 84eee24e (unverified)

Collect system stats in scheduler & Add scripts for experiments (#30)

Authored Apr 12, 2023 by Woosuk Kwon; committed by GitHub, Apr 12, 2023.
Parent: e3cec88a
Showing 12 changed files with 830 additions and 3 deletions (+830, -3).
.gitignore                                +4    -0
benchmark/benchmark_latency.py            +1    -0
benchmark/benchmark_text_completion.py    +289  -0
benchmark/trace.py                        +116  -0
cacheflow/master/block_manager.py         +6    -0
cacheflow/master/scheduler.py             +146  -1
cacheflow/master/server.py                +8    -1
cacheflow/master/simple_frontend.py       +3    -1
cacheflow/sequence.py                     +1    -0
plot/plot_normalized_latency.py           +203  -0
plot/plot_stats.py                        +52   -0
simple_server.py                          +1    -0
.gitignore

```diff
@@ -4,3 +4,7 @@
 *.eggs/
 *.so
 build/
+*.pkl
+*.png
+**/log.txt
```
benchmark/benchmark_latency.py

```diff
@@ -37,6 +37,7 @@ def main(args: argparse.Namespace):
         seed=args.seed,
         swap_space=args.swap_space,
         max_num_batched_tokens=args.max_num_batched_tokens,
+        max_num_sequences=args.max_num_sequences,
         num_nodes=num_nodes,
         num_devices_per_node=num_devices_per_node,
         distributed_init_method=distributed_init_method,
```
benchmark/benchmark_text_completion.py (new file, mode 100644)

```python
import argparse
import logging
import os
import pickle
import time
from typing import List

from tqdm import tqdm
from transformers import AutoConfig

from benchmark.trace import generate_text_completion_requests
from cacheflow.master.simple_frontend import SimpleFrontend
from cacheflow.master.server import (
    Server, add_server_arguments, initialize_ray_cluster)
from cacheflow.sampling_params import SamplingParams
from cacheflow.utils import get_gpu_memory, get_cpu_memory


logger = logging.getLogger(__name__)


def main(args: argparse.Namespace):
    assert args.pipeline_parallel_size == 1, (
        'Pipeline parallelism is not supported yet.')

    (num_nodes, num_devices_per_node, distributed_init_method,
     all_stage_devices) = (
        initialize_ray_cluster(
            address='local',
            pipeline_parallel_size=args.pipeline_parallel_size,
            tensor_parallel_size=args.tensor_parallel_size))

    # Create a server.
    server = Server(
        model=args.model,
        model_path=args.model_path,
        use_dummy_weights=args.use_dummy_weights,
        pipeline_parallel_size=args.pipeline_parallel_size,
        tensor_parallel_size=args.tensor_parallel_size,
        block_size=args.block_size,
        dtype=args.dtype,
        seed=args.seed,
        swap_space=args.swap_space,
        max_num_batched_tokens=args.max_num_batched_tokens,
        max_num_sequences=args.max_num_sequences,
        num_nodes=num_nodes,
        num_devices_per_node=num_devices_per_node,
        distributed_init_method=distributed_init_method,
        all_stage_devices=all_stage_devices,
        gpu_memory=get_gpu_memory(),
        cpu_memory=get_cpu_memory(),
        collect_stats=True,
        do_memory_analysis=args.do_memory_analysis,
    )

    # Create a frontend.
    frontend = SimpleFrontend(
        model_name=args.model,
        block_size=args.block_size,
    )

    # Generate requests.
    requests = generate_text_completion_requests(
        args.dataset,
        args.request_rate,
        args.duration,
        args.seed,
        args.n1,
        args.n2,
        args.n3,
        args.n4,
        args.n6,
        args.n2_beam,
        args.n4_beam,
        args.n6_beam,
        args.n8_beam,
    )

    # Warm up.
    logger.info('Warming up.')
    num_warmup_requests = 8
    warmup_input_len = 8
    warmup_output_len = 32
    warmup_sampling_params = SamplingParams(
        n=1,
        temperature=1.0,
        top_p=0.99,
        max_num_steps=warmup_output_len,
        use_beam_search=False,
        stop_token_ids=set(),
        num_logprobs=0,
        context_window_size=None,
    )
    for _ in range(num_warmup_requests):
        frontend._add_query([0] * warmup_input_len, warmup_sampling_params)
    server.add_sequence_groups(frontend.get_inputs())
    while True:
        server.step()
        if not server.has_unfinished_requests():
            break

    # Start benchmarking.
    logger.info('Start benchmarking.')
    # Initialize tqdm.
    pbar = tqdm(total=len(requests), desc='Finished requests')

    finished = []
    server.scheduler.reset_stats()
    start_time = time.time()
    while True:
        now = time.time()
        if args.timeout is not None and now - start_time > args.timeout:
            logger.info('Timeout. Stop benchmarking.')
            break

        while requests:
            if requests[0][0] <= now - start_time:
                request_time, input_tokens, sampling_params = requests.pop(0)
                frontend._add_query(
                    input_tokens, sampling_params,
                    arrival_time=start_time + request_time)
            else:
                break
        server.add_sequence_groups(frontend.get_inputs())
        updated_seq_groups = server.step()

        now = time.time()
        for seq_group in updated_seq_groups:
            if not seq_group.is_finished():
                continue
            arrival_time = seq_group.arrival_time
            finish_time = now
            for seq in seq_group.get_seqs():
                seq_len = seq.get_len()
                output_len = seq_len - seq.prompt_len
                finished.append({
                    'group_id': seq_group.group_id,
                    'seq_id': seq.seq_id,
                    'arrival_time': arrival_time,
                    'finish_time': finish_time,
                    'prompt_len': seq.prompt_len,
                    'output_len': output_len,
                })
            pbar.update(1)

        if not (requests or server.has_unfinished_requests()):
            break
    pbar.close()

    logger.info('Finish benchmarking. Saving stats.')
    server.scheduler.save_stats(args.output_dir)
    with open(os.path.join(args.output_dir, 'sequences.pkl'), 'wb') as f:
        pickle.dump(finished, f)
    logger.info('Done.')


def get_model_name(model: str) -> str:
    OPT_MODELS = [
        'opt-125m',
        'opt-350m',
        'opt-1.3b',
        'opt-2.7b',
        'opt-6.7b',
        'opt-13b',
        'opt-30b',
        'opt-66b',
        'opt-175b',
    ]
    for opt_model in OPT_MODELS:
        if opt_model in model:
            return opt_model

    config = AutoConfig.from_pretrained(model)
    assert config.model_type == 'llama'
    hidden_size = config.hidden_size
    if hidden_size == 4096:
        return 'llama-7b'
    elif hidden_size == 5120:
        return 'llama-13b'
    elif hidden_size == 6656:
        return 'llama-30b'
    elif hidden_size == 8192:
        return 'llama-65b'
    else:
        raise ValueError(f'Unknown model: {model}')


def get_dataset_name(dataset: str) -> str:
    if 'sharegpt' in dataset.lower():
        return 'sharegpt'
    elif 'alpaca' in dataset.lower():
        return 'alpaca'
    else:
        raise ValueError(f'Unknown dataset: {dataset}')


def get_sampling_dir_name(
    n1: float,
    n2: float,
    n3: float,
    n4: float,
    n6: float,
    n2_beam: float,
    n4_beam: float,
    n6_beam: float,
    n8_beam: float,
) -> str:
    method = ''
    if n1 > 0.0:
        method = 'n1' if n1 == 1.0 else method + f'n1-{n1}-'
    if n2 > 0.0:
        method = 'n2' if n2 == 1.0 else method + f'n2-{n2}-'
    if n3 > 0.0:
        method = 'n3' if n3 == 1.0 else method + f'n3-{n3}-'
    if n4 > 0.0:
        method = 'n4' if n4 == 1.0 else method + f'n4-{n4}-'
    if n6 > 0.0:
        method = 'n6' if n6 == 1.0 else method + f'n6-{n6}-'
    if n2_beam > 0.0:
        method = 'n2-beam' if n2_beam == 1.0 else method + f'n2-beam-{n2_beam}-'
    if n4_beam > 0.0:
        method = 'n4-beam' if n4_beam == 1.0 else method + f'n4-beam-{n4_beam}-'
    if n6_beam > 0.0:
        method = 'n6-beam' if n6_beam == 1.0 else method + f'n6-beam-{n6_beam}-'
    if n8_beam > 0.0:
        method = 'n8-beam' if n8_beam == 1.0 else method + f'n8-beam-{n8_beam}-'
    return method[:-1] if method.endswith('-') else method


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='CacheFlow simple server.')
    parser = add_server_arguments(parser)
    parser.add_argument('--output-dir', type=str,
                        help='path to output directory', default=None)
    parser.add_argument('--dataset', type=str,
                        help='path to dataset', required=True)
    parser.add_argument('--request-rate', type=float,
                        help='reqs/sec', required=True)
    parser.add_argument('--duration', type=int,
                        help='duration in seconds', required=True)
    parser.add_argument('--do-memory-analysis', action='store_true',
                        help='do memory analysis (This will lower the '
                             'throughput. Use this only for analysis.)')
    parser.add_argument('--timeout', type=int,
                        help='time out in seconds', default=None)
    parser.add_argument('--n1', type=float,
                        help='ratio of requests with n=1', default=0.0)
    parser.add_argument('--n2', type=float,
                        help='ratio of requests with n=2', default=0.0)
    parser.add_argument('--n3', type=float,
                        help='ratio of requests with n=3', default=0.0)
    parser.add_argument('--n4', type=float,
                        help='ratio of requests with n=4', default=0.0)
    parser.add_argument('--n6', type=float,
                        help='ratio of requests with n=6', default=0.0)
    parser.add_argument('--n2-beam', type=float,
                        help='ratio of requests with n=2 & beam search', default=0.0)
    parser.add_argument('--n4-beam', type=float,
                        help='ratio of requests with n=4 & beam search', default=0.0)
    parser.add_argument('--n6-beam', type=float,
                        help='ratio of requests with n=6 & beam search', default=0.0)
    parser.add_argument('--n8-beam', type=float,
                        help='ratio of requests with n=8 & beam search', default=0.0)
    args = parser.parse_args()
    if (args.n1 + args.n2 + args.n3 + args.n4 + args.n6 + args.n2_beam
            + args.n4_beam + args.n6_beam + args.n8_beam != 1.0):
        raise ValueError('The ratios of requests must sum to 1.')

    model_name = get_model_name(args.model)
    dataset_name = get_dataset_name(args.dataset)
    if 'opt' in model_name:
        if 'opt' not in args.dataset.lower():
            raise ValueError(f'OPT models can only be used with OPT datasets.')
    elif 'llama' in model_name:
        if 'llama' not in args.dataset.lower():
            raise ValueError(f'Llama models can only be used with Llama datasets.')

    dataset_name = 'sharegpt' if 'sharegpt' in args.dataset else 'alpaca'
    sample_dir = get_sampling_dir_name(
        args.n1, args.n2, args.n3, args.n4, args.n6,
        args.n2_beam, args.n4_beam, args.n6_beam, args.n8_beam)
    if args.output_dir is None:
        args.output_dir = os.path.join(
            '../exp',
            dataset_name,
            f'{model_name}-tp{args.tensor_parallel_size}',
            sample_dir,
            'cacheflow',
            f'req-rate-{args.request_rate}',
            f'seed{args.seed}',
            f'duration-{args.duration}',
        )
    os.makedirs(args.output_dir, exist_ok=True)

    # Set up logging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler(os.path.join(args.output_dir, 'log.txt')),
        ],
    )
    logger.info(args)

    main(args)
```
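For reference, the script is driven entirely from the command line; an illustrative invocation (the exact entry point, model flag, and dataset path depend on your setup, with `--model` and the other server flags coming from `add_server_arguments`) would be something like `python -m benchmark.benchmark_text_completion --model facebook/opt-13b --dataset <dataset.pkl> --request-rate 2.0 --duration 600 --n1 1.0`. Note that the `--n*` ratios must sum to 1.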
benchmark/trace.py (new file, mode 100644)

```python
import pickle
import random
from typing import List, Tuple

import numpy as np

from cacheflow.sampling_params import SamplingParams


def generate_text_completion_requests(
    dataset: str,
    request_rate: float,
    duration: int,
    seed: int,
    n1: float = 0.0,
    n2: float = 0.0,
    n3: float = 0.0,
    n4: float = 0.0,
    n6: float = 0.0,
    n2_beam: float = 0.0,
    n4_beam: float = 0.0,
    n6_beam: float = 0.0,
    n8_beam: float = 0.0,
    max_seq_len: int = 2048,
    time_quantum: int = 10,
) -> List[Tuple[float, List[int], SamplingParams]]:
    random.seed(seed)
    np.random.seed(seed)

    # Generate timestamps for requests using Poisson distribution.
    lam = request_rate * (time_quantum / 1000)
    quantums_per_sec = 1000 / time_quantum
    arrival_times = np.random.poisson(
        lam=lam, size=int(duration * quantums_per_sec))
    timestamps = []
    for i, n in enumerate(arrival_times):
        timestamps += [i * (time_quantum / 1000)] * n

    # Load and shuffle the dataset.
    num_requests = len(timestamps)
    with open(dataset, 'rb') as f:
        data = pickle.load(f)

    filtered = []
    for pair in data:
        input_tokens, output_tokens = pair
        input_len = len(input_tokens)
        output_len = len(output_tokens)
        # Filter out too long sequences.
        if input_len + output_len < max_seq_len:
            # Output tokens are not needed for the benchmark.
            filtered.append((input_tokens, output_len))

    data = []
    while len(data) < num_requests:
        data += filtered
    data = data[:num_requests]
    # Shuffle the data.
    assert len(data) == len(timestamps)
    random.shuffle(data)

    random_sampling_params_dict = {
        'temperature': 1.0,
        'top_p': 1.0,
        'use_beam_search': False,
        'stop_token_ids': set(),
        'num_logprobs': 0,
        'context_window_size': None,
    }
    beam_search_params_dict = {
        'temperature': 0.0,
        'top_p': 1.0,
        'use_beam_search': True,
        'stop_token_ids': set(),
        'num_logprobs': 0,
        'context_window_size': None,
    }

    # Generate requests based on the sampling parameter ratio.
    requests = []
    assert n1 + n2 + n3 + n4 + n6 + n2_beam + n4_beam + n6_beam + n8_beam == 1.0
    cum_sum = 0
    for timestamp, pair in zip(timestamps, data):
        input_tokens, output_len = pair
        if cum_sum < n1 * num_requests:
            sampling_params = SamplingParams(
                n=1, max_num_steps=output_len, **random_sampling_params_dict)
        elif cum_sum < (n1 + n2) * num_requests:
            sampling_params = SamplingParams(
                n=2, max_num_steps=output_len, **random_sampling_params_dict)
        elif cum_sum < (n1 + n2 + n3) * num_requests:
            sampling_params = SamplingParams(
                n=3, max_num_steps=output_len, **random_sampling_params_dict)
        elif cum_sum < (n1 + n2 + n3 + n4) * num_requests:
            sampling_params = SamplingParams(
                n=4, max_num_steps=output_len, **random_sampling_params_dict)
        elif cum_sum < (n1 + n2 + n3 + n4 + n6) * num_requests:
            sampling_params = SamplingParams(
                n=6, max_num_steps=output_len, **random_sampling_params_dict)
        elif cum_sum < (n1 + n2 + n3 + n4 + n6 + n2_beam) * num_requests:
            sampling_params = SamplingParams(
                n=2, max_num_steps=output_len, **beam_search_params_dict)
        elif cum_sum < (n1 + n2 + n3 + n4 + n6 + n2_beam
                        + n4_beam) * num_requests:
            sampling_params = SamplingParams(
                n=4, max_num_steps=output_len, **beam_search_params_dict)
        elif cum_sum < (n1 + n2 + n3 + n4 + n6 + n2_beam + n4_beam
                        + n6_beam) * num_requests:
            sampling_params = SamplingParams(
                n=6, max_num_steps=output_len, **beam_search_params_dict)
        elif cum_sum < (n1 + n2 + n3 + n4 + n6 + n2_beam + n4_beam + n6_beam
                        + n8_beam) * num_requests:
            sampling_params = SamplingParams(
                n=8, max_num_steps=output_len, **beam_search_params_dict)
        else:
            raise ValueError('Invalid request ratio.')
        cum_sum += 1
        requests.append((timestamp, input_tokens, sampling_params))
    return requests
```
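The arrival-time scheme above discretizes time into `time_quantum`-millisecond buckets and draws the number of arrivals in each bucket from a Poisson distribution. A minimal, self-contained sketch of just that logic (not part of the commit; the parameter values are illustrative):

```python
import numpy as np

# Illustrative values; trace.py takes these as arguments.
request_rate = 2.0   # requests per second
time_quantum = 10    # milliseconds per bucket
duration = 60        # seconds

# Mean number of arrivals per bucket.
lam = request_rate * (time_quantum / 1000)
num_buckets = int(duration * (1000 / time_quantum))
arrivals = np.random.poisson(lam=lam, size=num_buckets)

# Every arrival in a bucket is stamped with the bucket's start time,
# exactly as in generate_text_completion_requests above.
timestamps = []
for i, n in enumerate(arrivals):
    timestamps += [i * (time_quantum / 1000)] * n

print(f'{len(timestamps)} requests generated; '
      f'expected about {request_rate * duration:.0f}')
```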
cacheflow/master/block_manager.py

```diff
@@ -241,3 +241,9 @@ class BlockSpaceManager:
     def get_block_table(self, seq: Sequence) -> List[int]:
         block_table = self.block_tables[seq.seq_id]
         return [block.block_number for block in block_table]
+
+    def get_num_free_gpu_blocks(self) -> int:
+        return self.gpu_allocator.get_num_free_blocks()
+
+    def get_num_free_cpu_blocks(self) -> int:
+        return self.cpu_allocator.get_num_free_blocks()
```
cacheflow/master/scheduler.py

```diff
 import enum
+import os
+import pickle
 import time

-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 from cacheflow.master.block_manager import BlockSpaceManager
 from cacheflow.master.policy import PolicyFactory
@@ -34,12 +36,18 @@ class Scheduler:
         num_gpu_blocks: int,
         num_cpu_blocks: int,
         max_num_batched_tokens: int,
+        max_num_sequences: int,
+        collect_stats: bool,
+        do_memory_analysis: bool = False,
     ) -> None:
         self.controllers = controllers
         self.block_size = block_size
         self.num_gpu_blocks = num_gpu_blocks
         self.num_cpu_blocks = num_cpu_blocks
         self.max_num_batched_tokens = max_num_batched_tokens
+        self.max_num_sequences = max_num_sequences
+        self.collect_stats = collect_stats
+        self.do_memory_analysis = do_memory_analysis

         # Instantiate the scheduling policy.
         self.policy = PolicyFactory.get_policy(policy_name='fcfs')
@@ -61,6 +69,9 @@ class Scheduler:
         # Sequence groups in the SWAPPED state.
         self.swapped: List[SequenceGroup] = []

+        # Performance-related statistics.
+        self.stats = Stats(num_gpu_blocks, num_cpu_blocks)
+
     def add_sequence_groups(
         self,
         seq_groups: List[Tuple[SequenceGroup, SamplingParams]],
@@ -123,6 +134,12 @@ class Scheduler:
             if not self.block_manager.can_swap_in(seq_group):
                 break

+            # The total number of sequences in the RUNNING state should not
+            # exceed the maximum number of sequences.
+            num_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
+            if len(self.running) + num_seqs > self.max_num_sequences:
+                break
+
             seq_group = self.swapped.pop(0)
             self._swap_in(seq_group, blocks_to_swap_in)
             self._append(seq_group, blocks_to_copy)
@@ -156,12 +173,68 @@ class Scheduler:
                     > self.max_num_batched_tokens):
                 break

+            # The total number of sequences in the RUNNING state should not
+            # exceed the maximum number of sequences.
+            num_seqs = seq_group.num_seqs(status=SequenceStatus.WAITING)
+            if len(self.running) + num_seqs > self.max_num_sequences:
+                break
+
             seq_group = self.waiting.pop(0)
             self._allocate(seq_group)
             self.running.append(seq_group)
             num_batched_tokens += num_prompt_tokens
             prompt_group_ids.append(seq_group.group_id)

+        if self.collect_stats:
+            if self.running or blocks_to_swap_in or blocks_to_swap_out:
+                self.stats.timestamps.append(now - self.stats.start_time)
+                self.stats.input_lens.append(num_batched_tokens)
+                self.stats.swap_out_lens.append(
+                    len(blocks_to_swap_out) * self.block_size)
+                self.stats.swap_in_lens.append(
+                    len(blocks_to_swap_in) * self.block_size)
+                self.stats.num_preemption.append(len(preempted))
+                self.stats.num_swapped.append(len(self.swapped))
+                self.stats.num_running.append(len(self.running))
+                self.stats.num_waiting.append(len(self.waiting))
+
+                num_free_gpu_blocks = self.block_manager.get_num_free_gpu_blocks()
+                num_used_gpu_blocks = self.num_gpu_blocks - num_free_gpu_blocks
+                self.stats.gpu_cache_usage.append(
+                    num_used_gpu_blocks / self.num_gpu_blocks)
+                num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks()
+                num_used_cpu_blocks = self.num_cpu_blocks - num_free_cpu_blocks
+                self.stats.cpu_cache_usage.append(
+                    num_used_cpu_blocks / self.num_cpu_blocks)
+
+                if self.do_memory_analysis:
+                    block_tables = self.block_manager.block_tables
+                    num_logical_blocks = 0
+                    num_logical_tokens = 0
+                    num_physical_blocks = 0
+                    num_physical_tokens = 0
+                    physical_block_numbers = set()
+                    num_reserved_tokens = 0
+                    for seq_group in self.running:
+                        group_id = seq_group.group_id
+                        sampling_params = self.sampling_params[group_id]
+                        max_num_steps = sampling_params.max_num_steps
+                        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+                            num_logical_blocks += len(seq.logical_token_blocks)
+                            num_logical_tokens += seq.get_len()
+
+                            seq_id = seq.seq_id
+                            block_table = block_tables[seq_id]
+                            for i, block in enumerate(block_table):
+                                if block.block_number in physical_block_numbers:
+                                    continue
+                                physical_block_numbers.add(block.block_number)
+                                num_physical_blocks += 1
+                                num_physical_tokens += seq.logical_token_blocks[i].num_tokens
+
+                    assert num_physical_blocks == num_used_gpu_blocks
+                    self.stats.num_logical_blocks.append(num_logical_blocks)
+                    self.stats.num_logical_tokens.append(num_logical_tokens)
+                    self.stats.num_physical_blocks.append(num_physical_blocks)
+                    self.stats.num_physical_tokens.append(num_physical_tokens)
+                    self.stats.num_reserved_tokens.append(num_reserved_tokens)
+
         return (blocks_to_swap_in,
                 blocks_to_swap_out,
                 blocks_to_copy,
@@ -381,3 +454,75 @@ class Scheduler:
             blocks_to_swap_out.update(mapping)
         for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
             seq.status = SequenceStatus.SWAPPED
+
+    def reset_stats(self) -> None:
+        self.stats.reset(self.num_gpu_blocks, self.num_cpu_blocks)
+
+    def save_stats(
+        self,
+        output_dir: str,
+    ) -> None:
+        assert self.collect_stats, 'Statistics collection is disabled.'
+        self.stats.save(output_dir)
+
+
+class Stats:
+
+    def __init__(
+        self,
+        num_gpu_blocks: int,
+        num_cpu_blocks: int,
+    ) -> None:
+        self.start_time: float = time.time()
+        self.num_gpu_blocks = num_gpu_blocks
+        self.num_cpu_blocks = num_cpu_blocks
+
+        self.timestamps: List[float] = []
+        self.input_lens: List[int] = []
+        self.swap_out_lens: List[int] = []
+        self.swap_in_lens: List[int] = []
+        self.num_preemption: List[int] = []
+        self.num_waiting: List[int] = []
+        self.num_running: List[int] = []
+        self.num_swapped: List[int] = []
+        self.gpu_cache_usage: List[float] = []
+        self.cpu_cache_usage: List[float] = []
+        self.num_logical_blocks: List[int] = []
+        self.num_logical_tokens: List[int] = []
+        self.num_physical_blocks: List[int] = []
+        self.num_physical_tokens: List[int] = []
+        self.num_reserved_tokens: List[int] = []
+
+    def reset(
+        self,
+        num_gpu_blocks: int,
+        num_cpu_blocks: int,
+    ) -> None:
+        self.__init__(num_gpu_blocks, num_cpu_blocks)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'start_time': self.start_time,
+            'num_gpu_blocks': self.num_gpu_blocks,
+            'num_cpu_blocks': self.num_cpu_blocks,
+            'timestamps': self.timestamps,
+            'input_lens': self.input_lens,
+            'swap_out_lens': self.swap_out_lens,
+            'swap_in_lens': self.swap_in_lens,
+            'num_preemption': self.num_preemption,
+            'num_waiting': self.num_waiting,
+            'num_running': self.num_running,
+            'num_swapped': self.num_swapped,
+            'gpu_cache_usage': self.gpu_cache_usage,
+            'cpu_cache_usage': self.cpu_cache_usage,
+            'num_logical_blocks': self.num_logical_blocks,
+            'num_logical_tokens': self.num_logical_tokens,
+            'num_physical_blocks': self.num_physical_blocks,
+            'num_physical_tokens': self.num_physical_tokens,
+            'num_reserved_tokens': self.num_reserved_tokens,
+        }
+
+    def save(self, output_dir: str) -> None:
+        with open(os.path.join(output_dir, 'stats.pkl'), 'wb') as f:
+            pickle.dump(self.to_dict(), f)
```
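Since `Stats.save` pickles the plain dict returned by `to_dict`, the recorded time series can be inspected without the plotting scripts. A minimal sketch (not part of the commit; the path is hypothetical):

```python
import pickle

# Path written by Scheduler.save_stats(output_dir); adjust to your run.
with open('exp/stats.pkl', 'rb') as f:
    stats = pickle.load(f)

# Each entry of gpu_cache_usage is the fraction of GPU KV-cache blocks
# in use at one scheduler step.
peak = max(stats['gpu_cache_usage'], default=0.0)
print(f"Peak GPU cache usage: {peak:.1%} of {stats['num_gpu_blocks']} blocks")
```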
cacheflow/master/server.py

```diff
@@ -24,12 +24,15 @@ class Server:
         seed: int,
         swap_space: int,
         max_num_batched_tokens: int,
+        max_num_sequences: int,
         num_nodes: int,
         num_devices_per_node: int,
         distributed_init_method: str,
         all_stage_devices: List[List[DeviceID]],
         gpu_memory: int,
         cpu_memory: int,
+        collect_stats: bool = False,
+        do_memory_analysis: bool = False,
     ):
         self.num_nodes = num_nodes
         self.num_devices_per_node = num_devices_per_node
@@ -79,6 +82,9 @@ class Server:
             num_gpu_blocks=self.num_gpu_blocks,
             num_cpu_blocks=self.num_cpu_blocks,
             max_num_batched_tokens=max_num_batched_tokens,
+            max_num_sequences=max_num_sequences,
+            collect_stats=collect_stats,
+            do_memory_analysis=do_memory_analysis,
         )
         # Connect the controllers.
         for i in range(len(self.controllers) - 1):
@@ -180,6 +186,7 @@ def add_server_arguments(parser: argparse.ArgumentParser):
     # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
     parser.add_argument('--seed', type=int, default=0, help='random seed')
     parser.add_argument('--swap-space', type=int, default=20, help='CPU swap space size (GiB) per GPU')
-    parser.add_argument('--max-num-batched-tokens', type=int, default=2560, help='maximum number of batched tokens')
+    parser.add_argument('--max-num-batched-tokens', type=int, default=2560, help='maximum number of batched tokens per iteration')
+    parser.add_argument('--max-num-sequences', type=int, default=256, help='maximum number of sequences per iteration')
     parser.add_argument('--use-dummy-weights', action='store_true', help='use dummy values for model weights')
     return parser
```
cacheflow/master/simple_frontend.py

```diff
@@ -39,7 +39,9 @@ class SimpleFrontend:
         self,
         token_ids: List[int],
         sampling_params: SamplingParams,
+        arrival_time: Optional[float] = None,
     ) -> None:
-        arrival_time = time.time()
+        if arrival_time is None:
+            arrival_time = time.time()
         seqs: List[Sequence] = []
         for _ in range(sampling_params.n):
```
cacheflow/sequence.py

```diff
@@ -28,6 +28,7 @@ class Sequence:
         # Initialize the logical token blocks with the given token ids.
         self.add(token_ids)
+        self.prompt_len = len(token_ids)

         self.status = SequenceStatus.WAITING
         self.output_logprobs: List[Dict[int, float]] = []
         self.cumulative_logprobs = 0.0
```
plot/plot_normalized_latency.py (new file, mode 100644)

```python
import argparse
import os
import pickle
from typing import Any, Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np


SYSTEMS = [
    'orca-constant',
    'orca-power2',
    'orca-oracle',
    'cacheflow',
]

SYSTEM_TO_LABEL = {
    'orca-constant': 'Orca (Max)',
    'orca-power2': 'Orca (Next power of 2)',
    'orca-oracle': 'Orca (Oracle)',
    'cacheflow': 'CacheFlow',
}

SYSTEM_TO_COLOR = {
    'orca-constant': 'red',
    'orca-power2': 'orange',
    'orca-oracle': 'green',
    'cacheflow': 'blue',
}

SYSTEM_TO_MARKER = {
    'orca-constant': 'x',
    'orca-power2': '^',
    'orca-oracle': 's',
    'cacheflow': 'o',
}


def get_results(save_dir: str) -> List[Dict[str, Any]]:
    with open(os.path.join(save_dir, 'sequences.pkl'), 'rb') as f:
        results = pickle.load(f)
    return results


def get_request_rate(save_dir: str) -> float:
    """Get request rate from save_dir name."""
    # Directory name format:
    # .../req-rate-{req_rate}/seed-{seed}/duration-{duration}
    save_dir = os.path.abspath(save_dir)
    dir_names = save_dir.split('/')
    request_rate = None
    for dir_name in dir_names:
        if dir_name.startswith('req-rate-'):
            if request_rate is not None:
                raise ValueError(f'Found multiple request rates in {save_dir}')
            request_rate = float(dir_name.split('-')[-1])
    if request_rate is None:
        raise ValueError(f'Cannot find request rate in {save_dir}')
    return request_rate


def get_model(save_dir: str) -> Tuple[str, int]:
    save_dir = os.path.abspath(save_dir)
    dir_names = save_dir.split('/')
    model = None
    for dir_name in dir_names:
        if '-tp' in dir_name:
            if model is not None:
                raise ValueError(f'Found multiple models in {save_dir}')
            model = dir_name.split('-tp')[0]
            tp = int(dir_name.split('-tp')[-1])
    if model is None:
        raise ValueError(f'Cannot find model in {save_dir}')
    return model, tp


def get_system(save_dir: str) -> str:
    save_dir = os.path.abspath(save_dir)
    dir_names = save_dir.split('/')
    for dir_name in dir_names:
        if dir_name.startswith('orca-'):
            return dir_name
        if dir_name == 'cacheflow':
            return dir_name
    raise ValueError(f'Cannot find system in {save_dir}')


def get_sampling(save_dir: str) -> str:
    save_dir = os.path.abspath(save_dir)
    dir_names = save_dir.split('/')
    for dir_name in dir_names:
        if dir_name.startswith('n'):
            if dir_name.endswith('-beam'):
                return dir_name
            if dir_name[1:].isdigit():
                return dir_name
    raise ValueError(f'Cannot find sampling method in {save_dir}')


def plot_normalized_latency(
    exp_dir: str,
    duration: int,
    seed: int,
    warmup: int,
    xlim: Optional[float],
    ylim: Optional[float],
    log_scale: bool,
    format: str,
) -> None:
    # Get leaf directories.
    save_dirs = []
    for root, dirs, files in os.walk(exp_dir):
        if dirs:
            continue
        if 'sequences.pkl' not in files:
            continue
        if f'seed{seed}' not in root:
            continue
        if f'duration-{duration}' not in root:
            continue
        save_dirs.append(root)

    # Plot normalized latency.
    perf_per_system: Dict[str, Tuple[List[float], List[float]]] = {}
    for save_dir in save_dirs:
        per_seq_norm_latencies = []
        results = get_results(save_dir)
        for seq in results:
            arrival_time = seq['arrival_time']
            finish_time = seq['finish_time']
            output_len = seq['output_len']
            if arrival_time < warmup:
                continue
            latency = finish_time - arrival_time
            norm_latency = latency / output_len
            per_seq_norm_latencies.append(norm_latency)

        request_rate = get_request_rate(save_dir)
        normalized_latency = np.mean(per_seq_norm_latencies)

        system_name = get_system(save_dir)
        if system_name not in perf_per_system:
            perf_per_system[system_name] = ([], [])
        perf_per_system[system_name][0].append(request_rate)
        perf_per_system[system_name][1].append(normalized_latency)

        print('#seqs', len(per_seq_norm_latencies))
        print(f'{save_dir}: {normalized_latency:.3f} s')

    # Plot normalized latency.
    plt.figure(figsize=(6, 4))
    for system_name in reversed(SYSTEMS):
        if system_name not in perf_per_system:
            continue
        # Sort by request rate.
        request_rates, normalized_latencies = perf_per_system[system_name]
        request_rates, normalized_latencies = zip(
            *sorted(zip(request_rates, normalized_latencies)))
        label = SYSTEM_TO_LABEL[system_name]
        color = SYSTEM_TO_COLOR[system_name]
        marker = SYSTEM_TO_MARKER[system_name]
        plt.plot(request_rates, normalized_latencies,
                 label=label, color=color, marker=marker)

    # plt.legend()
    plt.xlabel('Request rate (req/s)', fontsize=12)
    plt.ylabel('Normalized latency (s/token)', fontsize=12)
    if log_scale:
        plt.yscale('log')
    if xlim is not None:
        plt.xlim(left=0, right=xlim)
    if ylim is not None:
        if log_scale:
            plt.ylim(top=ylim)
        else:
            plt.ylim(bottom=0, top=ylim)

    # Save figure.
    model, tp = get_model(exp_dir)
    sampling = get_sampling(exp_dir)
    figname = f'{model}-tp{tp}-{sampling}.{format}'
    os.makedirs('./figures', exist_ok=True)
    plt.savefig(os.path.join('figures', figname), bbox_inches='tight')
    print(f'Saved figure to ./figures/{figname}')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('exp_dir', type=str)
    parser.add_argument('--duration', type=int, required=True)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--warmup', type=int, default=60)
    parser.add_argument('--xlim', type=float, required=False, default=None)
    parser.add_argument('--ylim', type=float, required=False, default=None)
    parser.add_argument('--log', action='store_true')
    parser.add_argument('--format', choices=['png', 'pdf'], default='png')
    args = parser.parse_args()

    plot_normalized_latency(
        args.exp_dir, args.duration, args.seed, args.warmup,
        args.xlim, args.ylim, args.log, args.format)
```
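For intuition about the metric: a request that arrives at t = 100 s, finishes at t = 115 s, and produced 300 output tokens contributes a normalized latency of (115 - 100) / 300 = 0.05 s/token. The script plots the mean of this per-sequence quantity for each system as a function of request rate, skipping sequences that arrived during the warmup window.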
plot/plot_stats.py (new file, mode 100644)

```python
import os
import pickle

import matplotlib.pyplot as plt


STAT_NAMES = [
    'input_lens',
    'num_running',
    'num_waiting',
    'num_preemption',
    'gpu_cache_usage',
    'cpu_cache_usage',
    'num_swapped',
    'swap_in_lens',
    'swap_out_lens',
]


def plot_stats(output_dir: str):
    # Get stats.
    with open(os.path.join(output_dir, 'stats.pkl'), 'rb') as f:
        stats = pickle.load(f)
    timestamps = stats['timestamps']

    # Draw one figure for each stat.
    num_stats = len(STAT_NAMES)
    COLORS = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'orange', 'purple', 'pink',
              'brown', 'gray']
    fig, axs = plt.subplots(num_stats, 1, figsize=(10, 2 * num_stats))
    for i, stat in enumerate(STAT_NAMES):
        data = stats[stat]
        if stat in ['gpu_cache_usage', 'cpu_cache_usage']:
            data = [x * 100 for x in data]
            stat = stat + ' (%)'
        axs[i].plot(timestamps, data, color=COLORS[i % len(COLORS)])
        axs[i].set_ylabel(stat.replace('_', ' '), fontdict={'fontsize': 12})
        axs[i].set_ylim(bottom=0)

    plt.xlabel('Time (s)')
    plt.tight_layout()
    fig_path = os.path.join(output_dir, 'stats.png')
    plt.savefig(fig_path)
    print(f'Saved stats to {fig_path}')


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('output_dir', type=str, help='Output directory.')
    args = parser.parse_args()

    plot_stats(args.output_dir)
```
simple_server.py

```diff
@@ -30,6 +30,7 @@ def main(args: argparse.Namespace):
         seed=args.seed,
         swap_space=args.swap_space,
         max_num_batched_tokens=args.max_num_batched_tokens,
+        max_num_sequences=args.max_num_sequences,
         num_nodes=num_nodes,
         num_devices_per_node=num_devices_per_node,
         distributed_init_method=distributed_init_method,
```