norm / vllm · Commits · 84eee24e

Unverified commit 84eee24e, authored Apr 12, 2023 by Woosuk Kwon, committed by GitHub on Apr 12, 2023.

Collect system stats in scheduler & Add scripts for experiments (#30)

Parent: e3cec88a

12 changed files with 830 additions and 3 deletions (+830 -3):
.gitignore                               +4    -0
benchmark/benchmark_latency.py           +1    -0
benchmark/benchmark_text_completion.py   +289  -0
benchmark/trace.py                       +116  -0
cacheflow/master/block_manager.py        +6    -0
cacheflow/master/scheduler.py            +146  -1
cacheflow/master/server.py               +8    -1
cacheflow/master/simple_frontend.py      +3    -1
cacheflow/sequence.py                    +1    -0
plot/plot_normalized_latency.py          +203  -0
plot/plot_stats.py                       +52   -0
simple_server.py                         +1    -0
.gitignore

@@ -4,3 +4,7 @@
 *.eggs/
 *.so
 build/
+*.pkl
+*.png
+**/log.txt
benchmark/benchmark_latency.py

@@ -37,6 +37,7 @@ def main(args: argparse.Namespace):
         seed=args.seed,
         swap_space=args.swap_space,
         max_num_batched_tokens=args.max_num_batched_tokens,
+        max_num_sequences=args.max_num_sequences,
         num_nodes=num_nodes,
         num_devices_per_node=num_devices_per_node,
         distributed_init_method=distributed_init_method,
benchmark/benchmark_text_completion.py  0 → 100644 (new file)

import argparse
import logging
import os
import pickle
import time
from typing import List

from tqdm import tqdm
from transformers import AutoConfig

from benchmark.trace import generate_text_completion_requests
from cacheflow.master.simple_frontend import SimpleFrontend
from cacheflow.master.server import (
    Server, add_server_arguments, initialize_ray_cluster)
from cacheflow.sampling_params import SamplingParams
from cacheflow.utils import get_gpu_memory, get_cpu_memory


logger = logging.getLogger(__name__)


def main(args: argparse.Namespace):
    assert args.pipeline_parallel_size == 1, (
        'Pipeline parallelism is not supported yet.')

    (num_nodes, num_devices_per_node, distributed_init_method,
     all_stage_devices) = (
        initialize_ray_cluster(
            address='local',
            pipeline_parallel_size=args.pipeline_parallel_size,
            tensor_parallel_size=args.tensor_parallel_size))

    # Create a server.
    server = Server(
        model=args.model,
        model_path=args.model_path,
        use_dummy_weights=args.use_dummy_weights,
        pipeline_parallel_size=args.pipeline_parallel_size,
        tensor_parallel_size=args.tensor_parallel_size,
        block_size=args.block_size,
        dtype=args.dtype,
        seed=args.seed,
        swap_space=args.swap_space,
        max_num_batched_tokens=args.max_num_batched_tokens,
        max_num_sequences=args.max_num_sequences,
        num_nodes=num_nodes,
        num_devices_per_node=num_devices_per_node,
        distributed_init_method=distributed_init_method,
        all_stage_devices=all_stage_devices,
        gpu_memory=get_gpu_memory(),
        cpu_memory=get_cpu_memory(),
        collect_stats=True,
        do_memory_analysis=args.do_memory_analysis,
    )

    # Create a frontend.
    frontend = SimpleFrontend(
        model_name=args.model,
        block_size=args.block_size,
    )

    # Generate requests.
    requests = generate_text_completion_requests(
        args.dataset,
        args.request_rate,
        args.duration,
        args.seed,
        args.n1,
        args.n2,
        args.n3,
        args.n4,
        args.n6,
        args.n2_beam,
        args.n4_beam,
        args.n6_beam,
        args.n8_beam,
    )

    # Warm up.
    logger.info('Warming up.')
    num_warmup_requests = 8
    warmup_input_len = 8
    warmup_output_len = 32
    warmup_sampling_params = SamplingParams(
        n=1,
        temperature=1.0,
        top_p=0.99,
        max_num_steps=warmup_output_len,
        use_beam_search=False,
        stop_token_ids=set(),
        num_logprobs=0,
        context_window_size=None,
    )
    for _ in range(num_warmup_requests):
        frontend._add_query([0] * warmup_input_len, warmup_sampling_params)
    server.add_sequence_groups(frontend.get_inputs())
    while True:
        server.step()
        if not server.has_unfinished_requests():
            break

    # Start benchmarking.
    logger.info('Start benchmarking.')
    # Initialize tqdm.
    pbar = tqdm(total=len(requests), desc='Finished requests')

    finished = []
    server.scheduler.reset_stats()
    start_time = time.time()
    while True:
        now = time.time()
        if args.timeout is not None and now - start_time > args.timeout:
            logger.info('Timeout. Stop benchmarking.')
            break

        while requests:
            if requests[0][0] <= now - start_time:
                request_time, input_tokens, sampling_params = requests.pop(0)
                frontend._add_query(
                    input_tokens, sampling_params,
                    arrival_time=start_time + request_time)
            else:
                break
        server.add_sequence_groups(frontend.get_inputs())
        updated_seq_groups = server.step()

        now = time.time()
        for seq_group in updated_seq_groups:
            if not seq_group.is_finished():
                continue
            arrival_time = seq_group.arrival_time
            finish_time = now
            for seq in seq_group.get_seqs():
                seq_len = seq.get_len()
                output_len = seq_len - seq.prompt_len
                finished.append({
                    'group_id': seq_group.group_id,
                    'seq_id': seq.seq_id,
                    'arrival_time': arrival_time,
                    'finish_time': finish_time,
                    'prompt_len': seq.prompt_len,
                    'output_len': output_len,
                })
            pbar.update(1)

        if not (requests or server.has_unfinished_requests()):
            break
    pbar.close()

    logger.info('Finish benchmarking. Saving stats.')
    server.scheduler.save_stats(args.output_dir)
    with open(os.path.join(args.output_dir, 'sequences.pkl'), 'wb') as f:
        pickle.dump(finished, f)
    logger.info('Done.')


def get_model_name(model: str) -> str:
    OPT_MODELS = [
        'opt-125m',
        'opt-350m',
        'opt-1.3b',
        'opt-2.7b',
        'opt-6.7b',
        'opt-13b',
        'opt-30b',
        'opt-66b',
        'opt-175b',
    ]
    for opt_model in OPT_MODELS:
        if opt_model in model:
            return opt_model

    config = AutoConfig.from_pretrained(model)
    assert config.model_type == 'llama'
    hidden_size = config.hidden_size
    if hidden_size == 4096:
        return 'llama-7b'
    elif hidden_size == 5120:
        return 'llama-13b'
    elif hidden_size == 6656:
        return 'llama-30b'
    elif hidden_size == 8192:
        return 'llama-65b'
    else:
        raise ValueError(f'Unknown model: {model}')


def get_dataset_name(dataset: str) -> str:
    if 'sharegpt' in dataset.lower():
        return 'sharegpt'
    elif 'alpaca' in dataset.lower():
        return 'alpaca'
    else:
        raise ValueError(f'Unknown dataset: {dataset}')


def get_sampling_dir_name(
    n1: float,
    n2: float,
    n3: float,
    n4: float,
    n6: float,
    n2_beam: float,
    n4_beam: float,
    n6_beam: float,
    n8_beam: float,
) -> str:
    method = ''
    if n1 > 0.0:
        method = 'n1' if n1 == 1.0 else method + f'n1-{n1}-'
    if n2 > 0.0:
        method = 'n2' if n2 == 1.0 else method + f'n2-{n2}-'
    if n3 > 0.0:
        method = 'n3' if n3 == 1.0 else method + f'n3-{n3}-'
    if n4 > 0.0:
        method = 'n4' if n4 == 1.0 else method + f'n4-{n4}-'
    if n6 > 0.0:
        method = 'n6' if n6 == 1.0 else method + f'n6-{n6}-'
    if n2_beam > 0.0:
        method = 'n2-beam' if n2_beam == 1.0 else method + f'n2-beam-{n2_beam}-'
    if n4_beam > 0.0:
        method = 'n4-beam' if n4_beam == 1.0 else method + f'n4-beam-{n4_beam}-'
    if n6_beam > 0.0:
        method = 'n6-beam' if n6_beam == 1.0 else method + f'n6-beam-{n6_beam}-'
    if n8_beam > 0.0:
        method = 'n8-beam' if n8_beam == 1.0 else method + f'n8-beam-{n8_beam}-'
    return method[:-1] if method.endswith('-') else method


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='CacheFlow simple server.')
    parser = add_server_arguments(parser)
    parser.add_argument('--output-dir', type=str,
                        help='path to output directory', default=None)
    parser.add_argument('--dataset', type=str,
                        help='path to dataset', required=True)
    parser.add_argument('--request-rate', type=float,
                        help='reqs/sec', required=True)
    parser.add_argument('--duration', type=int,
                        help='duration in seconds', required=True)
    parser.add_argument('--do-memory-analysis', action='store_true',
                        help='do memory analysis (This will lower the throughput. '
                             'Use this only for analysis.)')
    parser.add_argument('--timeout', type=int,
                        help='time out in seconds', default=None)
    parser.add_argument('--n1', type=float,
                        help='ratio of requests with n=1', default=0.0)
    parser.add_argument('--n2', type=float,
                        help='ratio of requests with n=2', default=0.0)
    parser.add_argument('--n3', type=float,
                        help='ratio of requests with n=3', default=0.0)
    parser.add_argument('--n4', type=float,
                        help='ratio of requests with n=4', default=0.0)
    parser.add_argument('--n6', type=float,
                        help='ratio of requests with n=6', default=0.0)
    parser.add_argument('--n2-beam', type=float,
                        help='ratio of requests with n=2 & beam search', default=0.0)
    parser.add_argument('--n4-beam', type=float,
                        help='ratio of requests with n=4 & beam search', default=0.0)
    parser.add_argument('--n6-beam', type=float,
                        help='ratio of requests with n=6 & beam search', default=0.0)
    parser.add_argument('--n8-beam', type=float,
                        help='ratio of requests with n=8 & beam search', default=0.0)
    args = parser.parse_args()
    if (args.n1 + args.n2 + args.n3 + args.n4 + args.n6 +
            args.n2_beam + args.n4_beam + args.n6_beam + args.n8_beam != 1.0):
        raise ValueError('The ratios of requests must sum to 1.')

    model_name = get_model_name(args.model)
    dataset_name = get_dataset_name(args.dataset)
    if 'opt' in model_name:
        if 'opt' not in args.dataset.lower():
            raise ValueError(f'OPT models can only be used with OPT datasets.')
    elif 'llama' in model_name:
        if 'llama' not in args.dataset.lower():
            raise ValueError(f'Llama models can only be used with Llama datasets.')
    dataset_name = 'sharegpt' if 'sharegpt' in args.dataset else 'alpaca'
    sample_dir = get_sampling_dir_name(
        args.n1, args.n2, args.n3, args.n4, args.n6,
        args.n2_beam, args.n4_beam, args.n6_beam, args.n8_beam)
    if args.output_dir is None:
        args.output_dir = os.path.join(
            '../exp',
            dataset_name,
            f'{model_name}-tp{args.tensor_parallel_size}',
            sample_dir,
            'cacheflow',
            f'req-rate-{args.request_rate}',
            f'seed{args.seed}',
            f'duration-{args.duration}',
        )
    os.makedirs(args.output_dir, exist_ok=True)

    # Set up logging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler(os.path.join(args.output_dir, 'log.txt')),
        ],
    )
    logger.info(args)

    main(args)
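For illustration only (not part of this commit): the per-sequence records written to sequences.pkl above can be post-processed directly. A minimal sketch, assuming a hypothetical output directory that follows the default '../exp/...' layout built in this script:

import pickle

# Hypothetical path following the default output-directory layout above.
path = '../exp/sharegpt/opt-13b-tp1/n1/cacheflow/req-rate-1.0/seed0/duration-900/sequences.pkl'
with open(path, 'rb') as f:
    seqs = pickle.load(f)

# Normalized latency = (finish_time - arrival_time) / output_len, per sequence.
norm_latencies = [(s['finish_time'] - s['arrival_time']) / s['output_len'] for s in seqs]
print(f'{len(seqs)} sequences, mean normalized latency: '
      f'{sum(norm_latencies) / len(norm_latencies):.3f} s/token')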
benchmark/trace.py  0 → 100644 (new file)

import pickle
import random
from typing import List, Tuple

import numpy as np

from cacheflow.sampling_params import SamplingParams


def generate_text_completion_requests(
    dataset: str,
    request_rate: float,
    duration: int,
    seed: int,
    n1: float = 0.0,
    n2: float = 0.0,
    n3: float = 0.0,
    n4: float = 0.0,
    n6: float = 0.0,
    n2_beam: float = 0.0,
    n4_beam: float = 0.0,
    n6_beam: float = 0.0,
    n8_beam: float = 0.0,
    max_seq_len: int = 2048,
    time_quantum: int = 10,
) -> List[Tuple[float, List[int], SamplingParams]]:
    random.seed(seed)
    np.random.seed(seed)

    # Generate timestamps for requests using Poisson distribution.
    lam = request_rate * (time_quantum / 1000)
    quantums_per_sec = 1000 / time_quantum
    arrival_times = np.random.poisson(
        lam=lam, size=int(duration * quantums_per_sec))
    timestamps = []
    for i, n in enumerate(arrival_times):
        timestamps += [i * (time_quantum / 1000)] * n

    # Load and shuffle the dataset.
    num_requests = len(timestamps)
    with open(dataset, 'rb') as f:
        data = pickle.load(f)

    filtered = []
    for pair in data:
        input_tokens, output_tokens = pair
        input_len = len(input_tokens)
        output_len = len(output_tokens)
        # Filter out too long sequences.
        if input_len + output_len < max_seq_len:
            # Output tokens are not needed for the benchmark.
            filtered.append((input_tokens, output_len))

    data = []
    while len(data) < num_requests:
        data += filtered
    data = data[:num_requests]
    # Shuffle the data.
    assert len(data) == len(timestamps)
    random.shuffle(data)

    random_sampling_params_dict = {
        'temperature': 1.0,
        'top_p': 1.0,
        'use_beam_search': False,
        'stop_token_ids': set(),
        'num_logprobs': 0,
        'context_window_size': None,
    }
    beam_search_params_dict = {
        'temperature': 0.0,
        'top_p': 1.0,
        'use_beam_search': True,
        'stop_token_ids': set(),
        'num_logprobs': 0,
        'context_window_size': None,
    }

    # Generate requests based on the sampling parameter ratio.
    requests = []
    assert n1 + n2 + n3 + n4 + n6 + n2_beam + n4_beam + n6_beam + n8_beam == 1.0
    cum_sum = 0
    for timestamp, pair in zip(timestamps, data):
        input_tokens, output_len = pair
        if cum_sum < n1 * num_requests:
            sampling_params = SamplingParams(
                n=1, max_num_steps=output_len, **random_sampling_params_dict)
        elif cum_sum < (n1 + n2) * num_requests:
            sampling_params = SamplingParams(
                n=2, max_num_steps=output_len, **random_sampling_params_dict)
        elif cum_sum < (n1 + n2 + n3) * num_requests:
            sampling_params = SamplingParams(
                n=3, max_num_steps=output_len, **random_sampling_params_dict)
        elif cum_sum < (n1 + n2 + n3 + n4) * num_requests:
            sampling_params = SamplingParams(
                n=4, max_num_steps=output_len, **random_sampling_params_dict)
        elif cum_sum < (n1 + n2 + n3 + n4 + n6) * num_requests:
            sampling_params = SamplingParams(
                n=6, max_num_steps=output_len, **random_sampling_params_dict)
        elif cum_sum < (n1 + n2 + n3 + n4 + n6 + n2_beam) * num_requests:
            sampling_params = SamplingParams(
                n=2, max_num_steps=output_len, **beam_search_params_dict)
        elif cum_sum < (n1 + n2 + n3 + n4 + n6 + n2_beam + n4_beam) * num_requests:
            sampling_params = SamplingParams(
                n=4, max_num_steps=output_len, **beam_search_params_dict)
        elif cum_sum < (n1 + n2 + n3 + n4 + n6 + n2_beam + n4_beam
                        + n6_beam) * num_requests:
            sampling_params = SamplingParams(
                n=6, max_num_steps=output_len, **beam_search_params_dict)
        elif cum_sum < (n1 + n2 + n3 + n4 + n6 + n2_beam + n4_beam
                        + n6_beam + n8_beam) * num_requests:
            sampling_params = SamplingParams(
                n=8, max_num_steps=output_len, **beam_search_params_dict)
        else:
            raise ValueError('Invalid request ratio.')
        cum_sum += 1
        requests.append((timestamp, input_tokens, sampling_params))
    return requests
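For illustration only (not part of this commit): a minimal sketch of calling the trace generator directly. The dataset path is hypothetical; it must point to a pickled list of (input_token_ids, output_token_ids) pairs, as the loader above expects.

from benchmark.trace import generate_text_completion_requests

requests = generate_text_completion_requests(
    dataset='sharegpt_tokenized.pkl',  # hypothetical dataset file
    request_rate=1.0,                  # average requests per second
    duration=600,                      # trace length in seconds
    seed=0,
    n1=1.0,                            # all requests use n=1 random sampling
)
# Each entry is (arrival_timestamp, input_token_ids, SamplingParams).
print(f'Generated {len(requests)} requests')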
cacheflow/master/block_manager.py

@@ -241,3 +241,9 @@ class BlockSpaceManager:
     def get_block_table(self, seq: Sequence) -> List[int]:
         block_table = self.block_tables[seq.seq_id]
         return [block.block_number for block in block_table]
+
+    def get_num_free_gpu_blocks(self) -> int:
+        return self.gpu_allocator.get_num_free_blocks()
+
+    def get_num_free_cpu_blocks(self) -> int:
+        return self.cpu_allocator.get_num_free_blocks()
cacheflow/master/scheduler.py

 import enum
+import os
+import pickle
 import time

-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 from cacheflow.master.block_manager import BlockSpaceManager
 from cacheflow.master.policy import PolicyFactory

@@ -34,12 +36,18 @@ class Scheduler:
         num_gpu_blocks: int,
         num_cpu_blocks: int,
         max_num_batched_tokens: int,
+        max_num_sequences: int,
+        collect_stats: bool,
+        do_memory_analysis: bool = False,
     ) -> None:
         self.controllers = controllers
         self.block_size = block_size
         self.num_gpu_blocks = num_gpu_blocks
         self.num_cpu_blocks = num_cpu_blocks
         self.max_num_batched_tokens = max_num_batched_tokens
+        self.max_num_sequences = max_num_sequences
+        self.collect_stats = collect_stats
+        self.do_memory_analysis = do_memory_analysis

         # Instantiate the scheduling policy.
         self.policy = PolicyFactory.get_policy(policy_name='fcfs')

@@ -61,6 +69,9 @@ class Scheduler:
         # Sequence groups in the SWAPPED state.
         self.swapped: List[SequenceGroup] = []

+        # Performance-related statistics.
+        self.stats = Stats(num_gpu_blocks, num_cpu_blocks)
+
     def add_sequence_groups(
         self,
         seq_groups: List[Tuple[SequenceGroup, SamplingParams]],

@@ -123,6 +134,12 @@ class Scheduler:
             if not self.block_manager.can_swap_in(seq_group):
                 break

+            # The total number of sequences in the RUNNING state should not
+            # exceed the maximum number of sequences.
+            num_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
+            if len(self.running) + num_seqs > self.max_num_sequences:
+                break
+
             seq_group = self.swapped.pop(0)
             self._swap_in(seq_group, blocks_to_swap_in)
             self._append(seq_group, blocks_to_copy)

@@ -156,12 +173,68 @@ class Scheduler:
                     > self.max_num_batched_tokens):
                 break

+            # The total number of sequences in the RUNNING state should not
+            # exceed the maximum number of sequences.
+            num_seqs = seq_group.num_seqs(status=SequenceStatus.WAITING)
+            if len(self.running) + num_seqs > self.max_num_sequences:
+                break
+
             seq_group = self.waiting.pop(0)
             self._allocate(seq_group)
             self.running.append(seq_group)
             num_batched_tokens += num_prompt_tokens
             prompt_group_ids.append(seq_group.group_id)

+        if self.collect_stats:
+            if self.running or blocks_to_swap_in or blocks_to_swap_out:
+                self.stats.timestamps.append(now - self.stats.start_time)
+                self.stats.input_lens.append(num_batched_tokens)
+                self.stats.swap_out_lens.append(
+                    len(blocks_to_swap_out) * self.block_size)
+                self.stats.swap_in_lens.append(
+                    len(blocks_to_swap_in) * self.block_size)
+                self.stats.num_preemption.append(len(preempted))
+                self.stats.num_swapped.append(len(self.swapped))
+                self.stats.num_running.append(len(self.running))
+                self.stats.num_waiting.append(len(self.waiting))
+
+                num_free_gpu_blocks = self.block_manager.get_num_free_gpu_blocks()
+                num_used_gpu_blocks = self.num_gpu_blocks - num_free_gpu_blocks
+                self.stats.gpu_cache_usage.append(
+                    num_used_gpu_blocks / self.num_gpu_blocks)
+                num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks()
+                num_used_cpu_blocks = self.num_cpu_blocks - num_free_cpu_blocks
+                self.stats.cpu_cache_usage.append(
+                    num_used_cpu_blocks / self.num_cpu_blocks)
+
+                if self.do_memory_analysis:
+                    block_tables = self.block_manager.block_tables
+                    num_logical_blocks = 0
+                    num_logical_tokens = 0
+                    num_physical_blocks = 0
+                    num_physical_tokens = 0
+                    physical_block_numbers = set()
+                    num_reserved_tokens = 0
+                    for seq_group in self.running:
+                        group_id = seq_group.group_id
+                        sampling_params = self.sampling_params[group_id]
+                        max_num_steps = sampling_params.max_num_steps
+                        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+                            num_logical_blocks += len(seq.logical_token_blocks)
+                            num_logical_tokens += seq.get_len()
+
+                            seq_id = seq.seq_id
+                            block_table = block_tables[seq_id]
+                            for i, block in enumerate(block_table):
+                                if block.block_number in physical_block_numbers:
+                                    continue
+                                physical_block_numbers.add(block.block_number)
+                                num_physical_blocks += 1
+                                num_physical_tokens += seq.logical_token_blocks[i].num_tokens
+
+                    assert num_physical_blocks == num_used_gpu_blocks
+                    self.stats.num_logical_blocks.append(num_logical_blocks)
+                    self.stats.num_logical_tokens.append(num_logical_tokens)
+                    self.stats.num_physical_blocks.append(num_physical_blocks)
+                    self.stats.num_physical_tokens.append(num_physical_tokens)
+                    self.stats.num_reserved_tokens.append(num_reserved_tokens)
+
         return (blocks_to_swap_in,
                 blocks_to_swap_out,
                 blocks_to_copy,

@@ -381,3 +454,75 @@ class Scheduler:
             blocks_to_swap_out.update(mapping)
             for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
                 seq.status = SequenceStatus.SWAPPED
+
+    def reset_stats(self) -> None:
+        self.stats.reset(self.num_gpu_blocks, self.num_cpu_blocks)
+
+    def save_stats(
+        self,
+        output_dir: str,
+    ) -> None:
+        assert self.collect_stats, 'Statistics collection is disabled.'
+        self.stats.save(output_dir)
+
+
+class Stats:
+
+    def __init__(
+        self,
+        num_gpu_blocks: int,
+        num_cpu_blocks: int,
+    ) -> None:
+        self.start_time: float = time.time()
+        self.num_gpu_blocks = num_gpu_blocks
+        self.num_cpu_blocks = num_cpu_blocks
+
+        self.timestamps: List[float] = []
+        self.input_lens: List[int] = []
+        self.swap_out_lens: List[int] = []
+        self.swap_in_lens: List[int] = []
+        self.num_preemption: List[int] = []
+        self.num_waiting: List[int] = []
+        self.num_running: List[int] = []
+        self.num_swapped: List[int] = []
+        self.gpu_cache_usage: List[float] = []
+        self.cpu_cache_usage: List[float] = []
+
+        self.num_logical_blocks: List[int] = []
+        self.num_logical_tokens: List[int] = []
+        self.num_physical_blocks: List[int] = []
+        self.num_physical_tokens: List[int] = []
+        self.num_reserved_tokens: List[int] = []
+
+    def reset(
+        self,
+        num_gpu_blocks: int,
+        num_cpu_blocks: int,
+    ) -> None:
+        self.__init__(num_gpu_blocks, num_cpu_blocks)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'start_time': self.start_time,
+            'num_gpu_blocks': self.num_gpu_blocks,
+            'num_cpu_blocks': self.num_cpu_blocks,
+            'timestamps': self.timestamps,
+            'input_lens': self.input_lens,
+            'swap_out_lens': self.swap_out_lens,
+            'swap_in_lens': self.swap_in_lens,
+            'num_preemption': self.num_preemption,
+            'num_waiting': self.num_waiting,
+            'num_running': self.num_running,
+            'num_swapped': self.num_swapped,
+            'gpu_cache_usage': self.gpu_cache_usage,
+            'cpu_cache_usage': self.cpu_cache_usage,
+            'num_logical_blocks': self.num_logical_blocks,
+            'num_logical_tokens': self.num_logical_tokens,
+            'num_physical_blocks': self.num_physical_blocks,
+            'num_physical_tokens': self.num_physical_tokens,
+            'num_reserved_tokens': self.num_reserved_tokens,
+        }
+
+    def save(self, output_dir: str) -> None:
+        with open(os.path.join(output_dir, 'stats.pkl'), 'wb') as f:
+            pickle.dump(self.to_dict(), f)
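For illustration only (not part of this commit): Stats.save() pickles the dict returned by to_dict(), so the collected counters can be inspected without the plotting scripts. A minimal sketch, with a hypothetical path:

import pickle

with open('path/to/output_dir/stats.pkl', 'rb') as f:  # hypothetical path
    stats = pickle.load(f)

# gpu_cache_usage holds one usage fraction per recorded scheduler step.
peak_gpu_usage = max(stats['gpu_cache_usage'], default=0.0)
print(f"{len(stats['timestamps'])} scheduler steps recorded, "
      f"peak GPU cache usage: {peak_gpu_usage:.1%}")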
cacheflow/master/server.py

@@ -24,12 +24,15 @@ class Server:
         seed: int,
         swap_space: int,
         max_num_batched_tokens: int,
+        max_num_sequences: int,
         num_nodes: int,
         num_devices_per_node: int,
         distributed_init_method: str,
         all_stage_devices: List[List[DeviceID]],
         gpu_memory: int,
         cpu_memory: int,
+        collect_stats: bool = False,
+        do_memory_analysis: bool = False,
     ):
         self.num_nodes = num_nodes
         self.num_devices_per_node = num_devices_per_node

@@ -79,6 +82,9 @@ class Server:
             num_gpu_blocks=self.num_gpu_blocks,
             num_cpu_blocks=self.num_cpu_blocks,
             max_num_batched_tokens=max_num_batched_tokens,
+            max_num_sequences=max_num_sequences,
+            collect_stats=collect_stats,
+            do_memory_analysis=do_memory_analysis,
         )

         # Connect the controllers.
         for i in range(len(self.controllers) - 1):

@@ -180,6 +186,7 @@ def add_server_arguments(parser: argparse.ArgumentParser):
     # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
     parser.add_argument('--seed', type=int, default=0, help='random seed')
     parser.add_argument('--swap-space', type=int, default=20, help='CPU swap space size (GiB) per GPU')
-    parser.add_argument('--max-num-batched-tokens', type=int, default=2560, help='maximum number of batched tokens')
+    parser.add_argument('--max-num-batched-tokens', type=int, default=2560, help='maximum number of batched tokens per iteration')
+    parser.add_argument('--max-num-sequences', type=int, default=256, help='maximum number of sequences per iteration')
     parser.add_argument('--use-dummy-weights', action='store_true', help='use dummy values for model weights')
     return parser
cacheflow/master/simple_frontend.py

@@ -39,7 +39,9 @@ class SimpleFrontend:
         self,
         token_ids: List[int],
         sampling_params: SamplingParams,
+        arrival_time: Optional[float] = None,
     ) -> None:
-        arrival_time = time.time()
+        if arrival_time is None:
+            arrival_time = time.time()
         seqs: List[Sequence] = []
         for _ in range(sampling_params.n):
cacheflow/sequence.py

@@ -28,6 +28,7 @@ class Sequence:
         # Initialize the logical token blocks with the given token ids.
         self.add(token_ids)
+        self.prompt_len = len(token_ids)
         self.status = SequenceStatus.WAITING
         self.output_logprobs: List[Dict[int, float]] = []
         self.cumulative_logprobs = 0.0
plot/plot_normalized_latency.py  0 → 100644 (new file)

import argparse
import os
import pickle
from typing import Any, Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np


SYSTEMS = [
    'orca-constant',
    'orca-power2',
    'orca-oracle',
    'cacheflow',
]

SYSTEM_TO_LABEL = {
    'orca-constant': 'Orca (Max)',
    'orca-power2': 'Orca (Next power of 2)',
    'orca-oracle': 'Orca (Oracle)',
    'cacheflow': 'CacheFlow',
}

SYSTEM_TO_COLOR = {
    'orca-constant': 'red',
    'orca-power2': 'orange',
    'orca-oracle': 'green',
    'cacheflow': 'blue',
}

SYSTEM_TO_MARKER = {
    'orca-constant': 'x',
    'orca-power2': '^',
    'orca-oracle': 's',
    'cacheflow': 'o',
}


def get_results(save_dir: str) -> List[Dict[str, Any]]:
    with open(os.path.join(save_dir, 'sequences.pkl'), 'rb') as f:
        results = pickle.load(f)
    return results


def get_request_rate(save_dir: str) -> float:
    """Get request rate from save_dir name."""
    # Directory name format:
    # .../req-rate-{req_rate}/seed-{seed}/duration-{duration}
    save_dir = os.path.abspath(save_dir)
    dir_names = save_dir.split('/')
    request_rate = None
    for dir_name in dir_names:
        if dir_name.startswith('req-rate-'):
            if request_rate is not None:
                raise ValueError(f'Found multiple request rates in {save_dir}')
            request_rate = float(dir_name.split('-')[-1])
    if request_rate is None:
        raise ValueError(f'Cannot find request rate in {save_dir}')
    return request_rate


def get_model(save_dir: str) -> Tuple[str, int]:
    save_dir = os.path.abspath(save_dir)
    dir_names = save_dir.split('/')
    model = None
    for dir_name in dir_names:
        if '-tp' in dir_name:
            if model is not None:
                raise ValueError(f'Found multiple models in {save_dir}')
            model = dir_name.split('-tp')[0]
            tp = int(dir_name.split('-tp')[-1])
    if model is None:
        raise ValueError(f'Cannot find model in {save_dir}')
    return model, tp


def get_system(save_dir: str) -> str:
    save_dir = os.path.abspath(save_dir)
    dir_names = save_dir.split('/')
    for dir_name in dir_names:
        if dir_name.startswith('orca-'):
            return dir_name
        if dir_name == 'cacheflow':
            return dir_name
    raise ValueError(f'Cannot find system in {save_dir}')


def get_sampling(save_dir: str) -> str:
    save_dir = os.path.abspath(save_dir)
    dir_names = save_dir.split('/')
    for dir_name in dir_names:
        if dir_name.startswith('n'):
            if dir_name.endswith('-beam'):
                return dir_name
            if dir_name[1:].isdigit():
                return dir_name
    raise ValueError(f'Cannot find sampling method in {save_dir}')


def plot_normalized_latency(
    exp_dir: str,
    duration: int,
    seed: int,
    warmup: int,
    xlim: Optional[float],
    ylim: Optional[float],
    log_scale: bool,
    format: str,
) -> None:
    # Get leaf directories.
    save_dirs = []
    for root, dirs, files in os.walk(exp_dir):
        if dirs:
            continue
        if 'sequences.pkl' not in files:
            continue
        if f'seed{seed}' not in root:
            continue
        if f'duration-{duration}' not in root:
            continue
        save_dirs.append(root)

    # Plot normalized latency.
    perf_per_system: Dict[str, Tuple[List[float], List[float]]] = {}
    for save_dir in save_dirs:
        per_seq_norm_latencies = []
        results = get_results(save_dir)
        for seq in results:
            arrival_time = seq['arrival_time']
            finish_time = seq['finish_time']
            output_len = seq['output_len']
            if arrival_time < warmup:
                continue
            latency = finish_time - arrival_time
            norm_latency = latency / output_len
            per_seq_norm_latencies.append(norm_latency)

        request_rate = get_request_rate(save_dir)
        normalized_latency = np.mean(per_seq_norm_latencies)

        system_name = get_system(save_dir)
        if system_name not in perf_per_system:
            perf_per_system[system_name] = ([], [])
        perf_per_system[system_name][0].append(request_rate)
        perf_per_system[system_name][1].append(normalized_latency)

        print('#seqs', len(per_seq_norm_latencies))
        print(f'{save_dir}: {normalized_latency:.3f} s')

    # Plot normalized latency.
    plt.figure(figsize=(6, 4))
    for system_name in reversed(SYSTEMS):
        if system_name not in perf_per_system:
            continue
        # Sort by request rate.
        request_rates, normalized_latencies = perf_per_system[system_name]
        request_rates, normalized_latencies = zip(
            *sorted(zip(request_rates, normalized_latencies)))
        label = SYSTEM_TO_LABEL[system_name]
        color = SYSTEM_TO_COLOR[system_name]
        marker = SYSTEM_TO_MARKER[system_name]
        plt.plot(request_rates, normalized_latencies,
                 label=label, color=color, marker=marker)

    # plt.legend()
    plt.xlabel('Request rate (req/s)', fontsize=12)
    plt.ylabel('Normalized latency (s/token)', fontsize=12)
    if log_scale:
        plt.yscale('log')
    if xlim is not None:
        plt.xlim(left=0, right=xlim)
    if ylim is not None:
        if log_scale:
            plt.ylim(top=ylim)
        else:
            plt.ylim(bottom=0, top=ylim)

    # Save figure.
    model, tp = get_model(exp_dir)
    sampling = get_sampling(exp_dir)
    figname = f'{model}-tp{tp}-{sampling}.{format}'
    os.makedirs('./figures', exist_ok=True)
    plt.savefig(os.path.join('figures', figname), bbox_inches='tight')
    print(f'Saved figure to ./figures/{figname}')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('exp_dir', type=str)
    parser.add_argument('--duration', type=int, required=True)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--warmup', type=int, default=60)
    parser.add_argument('--xlim', type=float, required=False, default=None)
    parser.add_argument('--ylim', type=float, required=False, default=None)
    parser.add_argument('--log', action='store_true')
    parser.add_argument('--format', choices=['png', 'pdf'], default='png')
    args = parser.parse_args()

    plot_normalized_latency(
        args.exp_dir, args.duration, args.seed, args.warmup,
        args.xlim, args.ylim, args.log, args.format)
plot/plot_stats.py  0 → 100644 (new file)

import os
import pickle

import matplotlib.pyplot as plt


STAT_NAMES = [
    'input_lens',
    'num_running',
    'num_waiting',
    'num_preemption',
    'gpu_cache_usage',
    'cpu_cache_usage',
    'num_swapped',
    'swap_in_lens',
    'swap_out_lens',
]


def plot_stats(output_dir: str):
    # Get stats.
    with open(os.path.join(output_dir, 'stats.pkl'), 'rb') as f:
        stats = pickle.load(f)
    timestamps = stats['timestamps']

    # Draw one figure for each stat.
    num_stats = len(STAT_NAMES)
    COLORS = ['b', 'g', 'r', 'c', 'm', 'y', 'k',
              'orange', 'purple', 'pink', 'brown', 'gray']
    fig, axs = plt.subplots(num_stats, 1, figsize=(10, 2 * num_stats))
    for i, stat in enumerate(STAT_NAMES):
        data = stats[stat]
        if stat in ['gpu_cache_usage', 'cpu_cache_usage']:
            data = [x * 100 for x in data]
            stat = stat + ' (%)'
        axs[i].plot(timestamps, data, color=COLORS[i % len(COLORS)])
        axs[i].set_ylabel(stat.replace('_', ' '), fontdict={'fontsize': 12})
        axs[i].set_ylim(bottom=0)

    plt.xlabel('Time (s)')
    plt.tight_layout()
    fig_path = os.path.join(output_dir, 'stats.png')
    plt.savefig(fig_path)
    print(f'Saved stats to {fig_path}')


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('output_dir', type=str, help='Output directory.')
    args = parser.parse_args()

    plot_stats(args.output_dir)
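For illustration only (not part of this commit): plot_stats() can also be called programmatically on a benchmark output directory once benchmark_text_completion.py has written stats.pkl there, assuming the plot/ directory is importable (e.g., when run from the repository root). The path below is hypothetical.

from plot.plot_stats import plot_stats

# Hypothetical benchmark output directory containing stats.pkl.
plot_stats('../exp/sharegpt/opt-13b-tp1/n1/cacheflow/req-rate-1.0/seed0/duration-900')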
simple_server.py

@@ -30,6 +30,7 @@ def main(args: argparse.Namespace):
         seed=args.seed,
         swap_space=args.swap_space,
         max_num_batched_tokens=args.max_num_batched_tokens,
+        max_num_sequences=args.max_num_sequences,
         num_nodes=num_nodes,
         num_devices_per_node=num_devices_per_node,
         distributed_init_method=distributed_init_method,