vllm · Commit 7c041ab5 (Unverified)
Authored May 09, 2023 by Woosuk Kwon · Committed by GitHub, May 09, 2023
Refactor system architecture (#82)
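In the files shown on this page, the refactor moves `cacheflow.parallel_utils` under `cacheflow.model_executor`, splits the old `cacheflow/models/utils.py` into `cacheflow/model_executor/utils.py` and `cacheflow/model_executor/weight_utils.py`, renames the `cacheflow.master` package to `cacheflow.core`, moves the HTTP frontend scripts to the repository root, and deletes the experiment plotting scripts.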
Parent: 8917782a

Changes: 39. Showing 19 changed files with 95 additions and 343 deletions (+95 −343).
cacheflow/model_executor/parallel_utils/__init__.py                    +12   −0
cacheflow/model_executor/parallel_utils/parallel_state.py               +0   −0
cacheflow/model_executor/parallel_utils/tensor_parallel/__init__.py     +0   −0
cacheflow/model_executor/parallel_utils/tensor_parallel/layers.py       +1   −1
cacheflow/model_executor/parallel_utils/tensor_parallel/mappings.py     +1   −1
cacheflow/model_executor/parallel_utils/tensor_parallel/random.py       +2   −2
cacheflow/model_executor/parallel_utils/tensor_parallel/utils.py        +2   −2
cacheflow/model_executor/parallel_utils/utils.py                        +1   −1
cacheflow/model_executor/utils.py                                      +41   −0
cacheflow/model_executor/weight_utils.py                               +27   −35
cacheflow/models/__init__.py                                            +0   −10
cacheflow/utils.py                                                      +1   −17
cacheflow/worker/controller.py                                          +1   −1
cacheflow/worker/worker.py                                              +5   −8
gradio_webserver.py                                                     +0   −0
plot/plot_normalized_latency.py                                         +0   −212
plot/plot_stats.py                                                      +0   −52
simple_server.py                                                        +1   −1
test_cli_client.py                                                      +0   −0
cacheflow/parallel_utils/__init__.py → cacheflow/model_executor/parallel_utils/__init__.py

```diff
-import cacheflow.parallel_utils.parallel_state
-import cacheflow.parallel_utils.tensor_parallel
-import cacheflow.parallel_utils.utils
+import cacheflow.model_executor.parallel_utils.parallel_state
+import cacheflow.model_executor.parallel_utils.tensor_parallel
+import cacheflow.model_executor.parallel_utils.utils

 # Alias parallel_state as mpu, its legacy name
 mpu = parallel_state
 ...
```
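For context: `mpu` is the legacy Megatron-LM name for the model-parallel state module, kept so ported call sites continue to work. A minimal sketch (assumes model parallelism has already been initialized):

```python
from cacheflow.model_executor import parallel_utils

# Both names refer to the same module object; `mpu` exists purely for
# backward compatibility with Megatron-style code.
assert parallel_utils.mpu is parallel_utils.parallel_state
world_size = parallel_utils.mpu.get_tensor_model_parallel_world_size()
```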
cacheflow/parallel_utils/parallel_state.py → cacheflow/model_executor/parallel_utils/parallel_state.py
File moved
cacheflow/parallel_utils/tensor_parallel/__init__.py → cacheflow/model_executor/parallel_utils/tensor_parallel/__init__.py
File moved
cacheflow/parallel_utils/tensor_parallel/layers.py → cacheflow/model_executor/parallel_utils/tensor_parallel/layers.py

```diff
@@ -9,7 +9,7 @@ import torch.nn.functional as F
 import torch.nn.init as init
 from torch.nn.parameter import Parameter

-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     get_all_reduce_launcher,
```
cacheflow/parallel_utils/tensor_parallel/mappings.py → cacheflow/model_executor/parallel_utils/tensor_parallel/mappings.py

```diff
@@ -2,7 +2,7 @@
 import torch

-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     get_tensor_model_parallel_group,
```
cacheflow/parallel_utils/tensor_parallel/random.py → cacheflow/model_executor/parallel_utils/tensor_parallel/random.py

```diff
@@ -10,7 +10,7 @@ from torch import _C
 from torch.cuda import _lazy_call, device as device_ctx_manager
 from torch.utils.checkpoint import detach_variable

-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_data_parallel_rank,
     get_tensor_model_parallel_group,
     get_tensor_model_parallel_rank,
@@ -22,7 +22,7 @@ from .utils import (
     gather_split_1d_tensor,
 )

-from cacheflow.parallel_utils.utils import safely_set_viewless_tensor_data
+from cacheflow.model_executor.parallel_utils.utils import safely_set_viewless_tensor_data

 # Default name for the model parallel rng tracker.
 _MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
```
cacheflow/parallel_utils/tensor_parallel/utils.py → cacheflow/model_executor/parallel_utils/tensor_parallel/utils.py

```diff
@@ -3,8 +3,8 @@
 import torch
 from typing import List, Sequence

-from cacheflow.parallel_utils.utils import divide
-from cacheflow.parallel_utils import parallel_state
+from cacheflow.model_executor.parallel_utils.utils import divide
+from cacheflow.model_executor.parallel_utils import parallel_state


 def split_tensor_along_last_dim(
     tensor: torch.Tensor,
```
cacheflow/parallel_utils/utils.py → cacheflow/model_executor/parallel_utils/utils.py

```diff
@@ -4,7 +4,7 @@ import operator
 import torch

-from cacheflow.parallel_utils import parallel_state
+from cacheflow.model_executor.parallel_utils import parallel_state


 def ensure_divisibility(numerator, denominator):
```
cacheflow/model_executor/utils.py (new file, 0 → 100644)

```python
import random
from typing import Union

import numpy as np
import torch

from cacheflow.model_executor.parallel_utils.parallel_state import (
    model_parallel_is_initialized)
from cacheflow.model_executor.parallel_utils.tensor_parallel import (
    model_parallel_cuda_manual_seed)

_STR_DTYPE_TO_TORCH_DTYPE = {
    'half': torch.half,
    'float': torch.float,
    'float16': torch.float16,
    'float32': torch.float32,
    'bfloat16': torch.bfloat16,
}


def get_torch_dtype(dtype: Union[torch.dtype, str]) -> torch.dtype:
    if isinstance(dtype, str):
        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype.lower()]
    else:
        torch_dtype = dtype
    return torch_dtype


def get_dtype_size(dtype: Union[torch.dtype, str]) -> int:
    torch_dtype = get_torch_dtype(dtype)
    return torch.tensor([], dtype=torch_dtype).element_size()


def set_random_seed(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    if model_parallel_is_initialized():
        model_parallel_cuda_manual_seed(seed)
```
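As a quick illustration (not part of the commit), the dtype helpers resolve string names to torch dtypes and report per-element sizes in bytes, and `set_random_seed` seeds every RNG in one call:

```python
import torch

from cacheflow.model_executor.utils import (get_dtype_size, get_torch_dtype,
                                            set_random_seed)

assert get_torch_dtype('bfloat16') is torch.bfloat16
assert get_dtype_size('half') == 2       # fp16: 2 bytes per element
assert get_dtype_size(torch.float32) == 4

# Seeds Python, NumPy, and Torch (plus CUDA and the model-parallel RNG
# tracker when available) so runs are reproducible.
set_random_seed(0)
```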
cacheflow/models/utils.py → cacheflow/model_executor/weight_utils.py

```diff
-import os
+import filelock
 import glob
+import json
-import filelock
-from typing import Union, Optional
+import os
+from typing import Iterator, List, Optional, Tuple

+from huggingface_hub import snapshot_download
 import numpy as np
 import torch
 from tqdm.auto import tqdm
-from huggingface_hub import snapshot_download

-from cacheflow.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank)
-
-_STR_DTYPE_TO_TORCH_DTYPE = {
-    'half': torch.half,
-    'float': torch.float,
-    'float16': torch.float16,
-    'float32': torch.float32,
-    'bfloat16': torch.bfloat16,
-}
-
-
-def get_torch_dtype(dtype: Union[torch.dtype, str]) -> torch.dtype:
-    if isinstance(dtype, str):
-        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype.lower()]
-    else:
-        torch_dtype = dtype
-    return torch_dtype
-
-
-def get_dtype_size(dtype: Union[torch.dtype, str]) -> int:
-    torch_dtype = get_torch_dtype(dtype)
-    return torch.tensor([], dtype=torch_dtype).element_size()


 class Disabledtqdm(tqdm):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs, disable=True)


-def hf_model_weights_iterator(model_name_or_path: str,
-                              cache_dir: Optional[str] = None,
-                              use_np_cache: bool = False):
+def hf_model_weights_iterator(
+    model_name_or_path: str,
+    cache_dir: Optional[str] = None,
+    use_np_cache: bool = False,
+) -> Iterator[Tuple[str, torch.Tensor]]:
     # Prepare file lock directory to prevent multiple processes from
     # downloading the same model weights at the same time.
     lock_dir = cache_dir if cache_dir is not None else "/tmp"
@@ -95,10 +74,14 @@ def hf_model_weights_iterator(model_name_or_path: str,
         yield name, param


-def load_tensor_parallel_weights(param, loaded_weight, param_name,
-                                 column_parallel_weight_names,
-                                 row_parallel_weight_names):
-    tensor_model_parallel_rank = get_tensor_model_parallel_rank()
+def load_tensor_parallel_weights(
+    param: torch.Tensor,
+    loaded_weight: torch.Tensor,
+    param_name: str,
+    column_parallel_weight_names: List[str],
+    row_parallel_weight_names: List[str],
+    tensor_model_parallel_rank: int,
+) -> None:
     for p in column_parallel_weight_names:
         if p in param_name:
             shard_size = param.shape[0]
@@ -116,3 +99,12 @@ def load_tensor_parallel_weights(param, loaded_weight, param_name,
             break
     assert param.shape == loaded_weight.shape
     param.data.copy_(loaded_weight)
+
+
+def initialize_dummy_weights(
+    model: torch.nn.Module,
+    low: float = -1e-3,
+    high: float = 1e-3,
+) -> None:
+    for param in model.state_dict().values():
+        param.data.uniform_(low, high)
```
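Two API notes on this file: `load_tensor_parallel_weights` now takes the tensor-parallel rank as an explicit parameter instead of querying `parallel_state` itself, and `initialize_dummy_weights` is new. A minimal sketch of the latter (the toy `nn.Linear` is a hypothetical stand-in for a real model):

```python
import torch.nn as nn

from cacheflow.model_executor.weight_utils import initialize_dummy_weights

model = nn.Linear(8, 8)  # hypothetical stand-in for a real model

# Fills every parameter with uniform noise in [-1e-3, 1e-3]; useful for
# exercising a model (e.g. memory profiling) without downloading
# checkpoint weights.
initialize_dummy_weights(model)
assert all(p.abs().max() <= 1e-3 for p in model.state_dict().values())
```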
cacheflow/models/__init__.py deleted (100644 → 0)

```diff
-from cacheflow.models.input_metadata import InputMetadata
-from cacheflow.models.model_utils import get_memory_analyzer
-from cacheflow.models.model_utils import get_model
-
-__all__ = [
-    'InputMetadata',
-    'get_memory_analyzer',
-    'get_model',
-]
```
cacheflow/utils.py

```diff
 import enum
 import random

-import psutil
 import numpy as np
+import psutil
 import torch

-from cacheflow.parallel_utils.parallel_state import (
-    model_parallel_is_initialized)
-from cacheflow.parallel_utils.tensor_parallel import (
-    model_parallel_cuda_manual_seed)


 class Device(enum.Enum):
     GPU = enum.auto()
@@ -28,17 +23,6 @@ class Counter:
         self.counter = 0


-def set_random_seed(seed: int):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-
-    if model_parallel_is_initialized():
-        model_parallel_cuda_manual_seed(seed)
-
-
 def get_gpu_memory(gpu: int = 0) -> int:
     return torch.cuda.get_device_properties(gpu).total_memory
```
cacheflow/worker/controller.py

```diff
@@ -5,7 +5,7 @@ try:
 except ImportError:
     ray = None

-from cacheflow.master.scheduler import Scheduler
+from cacheflow.core.scheduler import Scheduler
 from cacheflow.sequence import SequenceGroupInputs
 from cacheflow.worker.worker import Worker
```
cacheflow/worker/worker.py

```diff
@@ -2,18 +2,15 @@ from typing import Dict, List, Tuple, Optional

 import torch

-from cacheflow.models import get_model
-from cacheflow.models import InputMetadata
+from cacheflow.model_executor import get_model, InputMetadata, set_random_seed
+from cacheflow.model_executor.parallel_utils.parallel_state import (
+    initialize_model_parallel,
+    initialize_all_reduce_launcher,
+    get_tensor_model_parallel_world_size)
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import SequenceGroupInputs
 from cacheflow.sequence import SequenceOutputs
 from cacheflow.worker.cache_engine import CacheEngine
-from cacheflow.parallel_utils.parallel_state import (
-    initialize_model_parallel,
-    initialize_all_reduce_launcher,
-    get_tensor_model_parallel_world_size)
-from cacheflow.utils import set_random_seed


 class Worker:
```
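The worker now imports its whole model-execution surface from `cacheflow.model_executor` instead of three separate modules. A hedged sketch of the consolidated import in use (the `get_model` arguments are illustrative; its actual signature is not shown on this page of the diff):

```python
from cacheflow.model_executor import InputMetadata, get_model, set_random_seed

set_random_seed(0)  # previously imported from cacheflow.utils
model = get_model('facebook/opt-125m')  # illustrative call, hypothetical arguments
```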
cacheflow/http_frontend/gradio_webserver.py → gradio_webserver.py
File moved
plot/plot_normalized_latency.py deleted (100644 → 0)

```python
import argparse
import os
import pickle
from typing import Any, Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np

SYSTEMS = [
    'orca-constant',
    'orca-power2',
    'orca-oracle',
    'cacheflow',
]

SYSTEM_TO_LABEL = {
    'orca-constant': 'Orca (Max)',
    'orca-power2': 'Orca (Pow2)',
    'orca-oracle': 'Orca (Oracle)',
    'cacheflow': 'KVFlow',
}

SYSTEM_TO_COLOR = {
    'orca-constant': 'red',
    'orca-power2': 'orange',
    'orca-oracle': 'green',
    'cacheflow': 'blue',
}

SYSTEM_TO_MARKER = {
    'orca-constant': 'x',
    'orca-power2': '^',
    'orca-oracle': 's',
    'cacheflow': 'o',
}


def get_results(save_dir: str) -> List[Dict[str, Any]]:
    with open(os.path.join(save_dir, 'sequences.pkl'), 'rb') as f:
        results = pickle.load(f)
    return results


def get_request_rate(save_dir: str) -> float:
    """Get request rate from save_dir name."""
    # Directory name format:
    # .../req-rate-{req_rate}/seed-{seed}/duration-{duration}
    save_dir = os.path.abspath(save_dir)
    dir_names = save_dir.split('/')
    request_rate = None
    for dir_name in dir_names:
        if dir_name.startswith('req-rate-'):
            if request_rate is not None:
                raise ValueError(f'Found multiple request rates in {save_dir}')
            request_rate = float(dir_name.split('-')[-1])
    if request_rate is None:
        raise ValueError(f'Cannot find request rate in {save_dir}')
    return request_rate


def get_model(save_dir: str) -> Tuple[str, int]:
    save_dir = os.path.abspath(save_dir)
    dir_names = save_dir.split('/')
    model = None
    for dir_name in dir_names:
        if '-tp' in dir_name:
            if model is not None:
                raise ValueError(f'Found multiple models in {save_dir}')
            model = dir_name.split('-tp')[0]
            tp = int(dir_name.split('-tp')[-1])
    if model is None:
        raise ValueError(f'Cannot find model in {save_dir}')
    return model, tp


def get_system(save_dir: str) -> str:
    save_dir = os.path.abspath(save_dir)
    dir_names = save_dir.split('/')
    for dir_name in dir_names:
        if dir_name.startswith('orca-'):
            return dir_name
        if dir_name == 'cacheflow':
            return dir_name
    raise ValueError(f'Cannot find system in {save_dir}')


def get_sampling(save_dir: str) -> str:
    save_dir = os.path.abspath(save_dir)
    dir_names = save_dir.split('/')
    for dir_name in dir_names:
        if dir_name.startswith('n'):
            if dir_name.endswith('-beam'):
                return dir_name
            if dir_name[1:].isdigit():
                return dir_name
    raise ValueError(f'Cannot find sampling method in {save_dir}')


def plot_normalized_latency(
    exp_dir: str,
    duration: int,
    seed: int,
    warmup: int,
    xlim: Optional[float],
    ylim: Optional[float],
    log_scale: bool,
    format: str,
) -> None:
    # Get leaf directories.
    save_dirs = []
    for root, dirs, files in os.walk(exp_dir):
        if dirs:
            continue
        if 'sequences.pkl' not in files:
            continue
        if f'seed{seed}' not in root:
            continue
        if f'duration-{duration}' not in root:
            continue
        save_dirs.append(root)

    # Plot normalized latency.
    perf_per_system: Dict[str, Tuple[List[float], List[float]]] = {}
    for save_dir in save_dirs:
        per_seq_norm_latencies = []
        results = get_results(save_dir)
        for seq in results:
            arrival_time = seq['arrival_time']
            finish_time = seq['finish_time']
            output_len = seq['output_len']
            if arrival_time < warmup:
                continue
            latency = finish_time - arrival_time
            norm_latency = latency / output_len
            per_seq_norm_latencies.append(norm_latency)

        request_rate = get_request_rate(save_dir)
        normalized_latency = np.mean(per_seq_norm_latencies)

        system_name = get_system(save_dir)
        if system_name not in perf_per_system:
            perf_per_system[system_name] = ([], [])
        perf_per_system[system_name][0].append(request_rate)
        perf_per_system[system_name][1].append(normalized_latency)

        print('#seqs', len(per_seq_norm_latencies))
        print(f'{save_dir}: {normalized_latency:.3f} s')

    # Plot normalized latency.
    plt.figure(figsize=(6, 4))
    for system_name in reversed(SYSTEMS):
        if system_name not in perf_per_system:
            continue
        # Sort by request rate.
        request_rates, normalized_latencies = perf_per_system[system_name]
        request_rates, normalized_latencies = zip(
            *sorted(zip(request_rates, normalized_latencies)))
        label = SYSTEM_TO_LABEL[system_name]
        color = SYSTEM_TO_COLOR[system_name]
        marker = SYSTEM_TO_MARKER[system_name]
        plt.plot(request_rates, normalized_latencies,
                 label=label, color=color, marker=marker)

    # plt.legend()
    plt.xlabel('Request rate (req/s)', fontsize=12)
    plt.ylabel('Normalized latency (s/token)', fontsize=12)
    if log_scale:
        plt.yscale('log')
    if xlim is not None:
        plt.xlim(left=0, right=xlim)
    if ylim is not None:
        if log_scale:
            plt.ylim(top=ylim)
        else:
            plt.ylim(bottom=0, top=ylim)
    handles, labels = plt.gca().get_legend_handles_labels()
    handles = reversed(handles)
    labels = reversed(labels)
    plt.legend(handles, labels, ncol=4, fontsize=12, loc='upper center',
               bbox_to_anchor=(0.5, 1.15), columnspacing=0.5,
               handletextpad=0.5, handlelength=1.5, frameon=False,
               borderpad=0)

    # Save figure.
    model, tp = get_model(exp_dir)
    sampling = get_sampling(exp_dir)
    figname = f'{model}-tp{tp}-{sampling}.{format}'
    os.makedirs('./figures', exist_ok=True)
    plt.savefig(os.path.join('figures', figname), bbox_inches='tight')
    print(f'Saved figure to ./figures/{figname}')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('exp_dir', type=str)
    parser.add_argument('--duration', type=int, required=True)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--warmup', type=int, default=60)
    parser.add_argument('--xlim', type=float, required=False, default=None)
    parser.add_argument('--ylim', type=float, required=False, default=None)
    parser.add_argument('--log', action='store_true')
    parser.add_argument('--format', choices=['png', 'pdf'], default='png')
    args = parser.parse_args()

    plot_normalized_latency(
        args.exp_dir, args.duration, args.seed, args.warmup,
        args.xlim, args.ylim, args.log, args.format)
```
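For reference, the metric this deleted script plotted: a sequence's normalized latency is its end-to-end latency divided by its output length, averaged over all sequences that arrive after the warmup period. A tiny worked example:

```python
# One sequence arrives at t=10s, finishes at t=25s, and generates 30 tokens.
arrival_time, finish_time, output_len = 10.0, 25.0, 30
norm_latency = (finish_time - arrival_time) / output_len
print(norm_latency)  # 0.5 seconds per generated token
```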
plot/plot_stats.py deleted (100644 → 0)

```python
import os
import pickle

import matplotlib.pyplot as plt

STAT_NAMES = [
    'input_lens',
    'num_running',
    'num_waiting',
    'num_preemption',
    'gpu_cache_usage',
    'cpu_cache_usage',
    'num_swapped',
    'swap_in_lens',
    'swap_out_lens',
]


def plot_stats(output_dir: str):
    # Get stats.
    with open(os.path.join(output_dir, 'stats.pkl'), 'rb') as f:
        stats = pickle.load(f)
    timestamps = stats['timestamps']

    # Draw one figure for each stat.
    num_stats = len(STAT_NAMES)
    COLORS = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'orange', 'purple',
              'pink', 'brown', 'gray']
    fig, axs = plt.subplots(num_stats, 1, figsize=(10, 2 * num_stats))
    for i, stat in enumerate(STAT_NAMES):
        data = stats[stat]
        if stat in ['gpu_cache_usage', 'cpu_cache_usage']:
            data = [x * 100 for x in data]
            stat = stat + ' (%)'
        axs[i].plot(timestamps, data, color=COLORS[i % len(COLORS)])
        axs[i].set_ylabel(stat.replace('_', ' '), fontdict={'fontsize': 12})
        axs[i].set_ylim(bottom=0)
    plt.xlabel('Time (s)')
    plt.tight_layout()
    fig_path = os.path.join(output_dir, 'stats.png')
    plt.savefig(fig_path)
    print(f'Saved stats to {fig_path}')


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('output_dir', type=str, help='Output directory.')
    args = parser.parse_args()

    plot_stats(args.output_dir)
```
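The deleted script expected `output_dir` to contain a `stats.pkl` holding one time series per stat name. A minimal sketch of that input format (the values below are made up for illustration):

```python
import pickle

# 'timestamps' plus one list per STAT_NAMES entry; the cache-usage stats
# are fractions in [0, 1] that plot_stats rescaled to percentages.
stats = {
    'timestamps': [0.0, 1.0, 2.0],
    'input_lens': [5, 7, 6],
    'num_running': [1, 2, 2],
    'num_waiting': [0, 1, 0],
    'num_preemption': [0, 0, 1],
    'gpu_cache_usage': [0.10, 0.25, 0.40],
    'cpu_cache_usage': [0.00, 0.05, 0.05],
    'num_swapped': [0, 0, 1],
    'swap_in_lens': [0, 0, 2],
    'swap_out_lens': [0, 2, 0],
}
with open('stats.pkl', 'wb') as f:
    pickle.dump(stats, f)
```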
simple_server.py

```diff
 import argparse

-from cacheflow.master.server import (
+from cacheflow.core.server import (
     add_server_arguments, process_server_arguments,
     init_local_server_and_frontend_with_arguments)
 from cacheflow.sampling_params import SamplingParams
```
cacheflow/http_frontend/test_cli_client.py → test_cli_client.py
File moved