xdb4_94051 / vllm / Commits / 4858f3bb

Commit 4858f3bb (unverified), authored Apr 30, 2023 by Zhuohan Li; committed by GitHub on Apr 30, 2023. Parent: a96d63c2.

Add an option to launch cacheflow without ray (#51)

Showing 7 changed files with 102 additions and 28 deletions (+102, -28).
.gitignore                                   (+3, -0)
benchmark/benchmark_latency.py               (+8, -4)
benchmark/benchmark_text_completion.py       (+10, -6)
cacheflow/http_frontend/fastapi_frontend.py  (+15, -3)
cacheflow/master/server.py                   (+39, -4)
cacheflow/worker/controller.py               (+21, -9)
simple_server.py                             (+6, -2)
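In practice, the new option means a single-GPU server can start without a Ray installation, while any multi-GPU configuration still goes through Ray. A hypothetical launch sketch (the --model flag and its value are illustrative, not taken from this commit):

    # Single GPU: Ray is not required; --use-ray stays unset.
    python simple_server.py --model facebook/opt-125m

    # pipeline_parallel_size * tensor_parallel_size > 1:
    # process_server_arguments() sets use_ray=True automatically,
    # so Ray must be installed.
    python simple_server.py --model facebook/opt-125m --tensor-parallel-size 2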
.gitignore

@@ -3,8 +3,11 @@
 *.egg-info/
 *.eggs/
 *.so
+*.log
+*.csv
 build/
 *.pkl
 *.png
 **/log.txt
+.vscode/
benchmark/benchmark_latency.py

@@ -8,7 +8,8 @@ import torch
 from cacheflow.master.simple_frontend import SimpleFrontend
 from cacheflow.master.server import (Server, add_server_arguments,
-                                     initialize_ray_cluster)
+                                     process_server_arguments,
+                                     initialize_cluster)
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.utils import get_gpu_memory, get_cpu_memory
@@ -20,8 +21,8 @@ def main(args: argparse.Namespace):
     (num_nodes, num_devices_per_node, distributed_init_method,
      all_stage_devices) = (
-        initialize_ray_cluster(
-            address='local',
+        initialize_cluster(
+            use_ray=args.use_ray,
             pipeline_parallel_size=args.pipeline_parallel_size,
             tensor_parallel_size=args.tensor_parallel_size))
@@ -44,6 +45,7 @@ def main(args: argparse.Namespace):
         all_stage_devices=all_stage_devices,
         gpu_memory=get_gpu_memory(),
         cpu_memory=get_cpu_memory(),
+        use_ray=args.use_ray,
     )

     # Create a frontend.
@@ -91,7 +93,8 @@ def main(args: argparse.Namespace):
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='CacheFlow simple server.')
+    parser = argparse.ArgumentParser(
+        description='Benchmark the latency of decoding a single sentence.')
     parser = add_server_arguments(parser)
     parser.add_argument('--input-len', type=int, default=32)
     parser.add_argument('--output-len', type=int, default=128)
@@ -99,6 +102,7 @@ if __name__ == '__main__':
     parser.add_argument('--n', type=int, default=1)
     parser.add_argument('--use-beam-search', action='store_true')
     args = parser.parse_args()
+    args = process_server_arguments(args)
     args.max_num_batched_tokens = max(
         args.max_num_batched_tokens, args.batch_size * args.input_len)
     print(args)
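Both benchmark scripts now pass the parsed arguments through the new process_server_arguments helper before use. The resulting setup flow, as a standalone sketch built only from names visible in this diff:

    import argparse
    from cacheflow.master.server import (add_server_arguments,
                                         process_server_arguments)

    parser = argparse.ArgumentParser()
    parser = add_server_arguments(parser)  # adds --use-ray, -pp, -tp, ...
    args = parser.parse_args()
    # Forces args.use_ray = True whenever pipeline_parallel_size *
    # tensor_parallel_size > 1 (see cacheflow/master/server.py below).
    args = process_server_arguments(args)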
benchmark/benchmark_text_completion.py

@@ -11,7 +11,8 @@ from transformers import AutoConfig
 from benchmark.trace import generate_text_completion_requests
 from cacheflow.master.simple_frontend import SimpleFrontend
 from cacheflow.master.server import (Server, add_server_arguments,
-                                     initialize_ray_cluster)
+                                     process_server_arguments,
+                                     initialize_cluster)
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.utils import get_gpu_memory, get_cpu_memory
@@ -25,8 +26,8 @@ def main(args: argparse.Namespace):
     (num_nodes, num_devices_per_node, distributed_init_method,
      all_stage_devices) = (
-        initialize_ray_cluster(
-            address='local',
+        initialize_cluster(
+            use_ray=args.use_ray,
             pipeline_parallel_size=args.pipeline_parallel_size,
             tensor_parallel_size=args.tensor_parallel_size))
@@ -49,6 +50,7 @@ def main(args: argparse.Namespace):
         all_stage_devices=all_stage_devices,
         gpu_memory=get_gpu_memory(),
         cpu_memory=get_cpu_memory(),
+        use_ray=args.use_ray,
         collect_stats=True,
         do_memory_analysis=args.do_memory_analysis,
     )
@@ -134,7 +136,7 @@ def main(args: argparse.Namespace):
             finished.append({
                 'group_id': seq_group.group_id,
                 'seq_id': seq.seq_id,
                 'arrival_time': arrival_time,
                 'finish_time': finish_time,
                 'prompt_len': seq.prompt_len,
                 'output_len': output_len,
@@ -225,8 +227,9 @@ def get_sampling_dir_name(
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='CacheFlow simple server.')
+    parser = argparse.ArgumentParser(
+        description='Benchmark the performance on a series of requests.')
     parser = add_server_arguments(parser)
     parser.add_argument('--output-dir', type=str, help='path to output directory', default=None)
     parser.add_argument('--dataset', type=str, help='path to dataset', required=True)
@@ -246,6 +249,7 @@ if __name__ == '__main__':
     parser.add_argument('--n6-beam', type=float, help='ratio of requests with n=6 & beam search', default=0.0)
     parser.add_argument('--n8-beam', type=float, help='ratio of requests with n=8 & beam search', default=0.0)
     args = parser.parse_args()
+    args = process_server_arguments(args)
     if args.n1 + args.n2 + args.n3 + args.n4 + args.n6 + args.n2_beam + args.n4_beam + args.n6_beam + args.n8_beam != 1.0:
         raise ValueError('The ratios of requests must sum to 1.')
cacheflow/http_frontend/fastapi_frontend.py

@@ -13,7 +13,8 @@ import uvicorn
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence, SequenceGroup
 from cacheflow.master.server import (Server, add_server_arguments,
-                                     initialize_ray_cluster)
+                                     process_server_arguments,
+                                     initialize_cluster)
 from cacheflow.worker.controller import DeviceID
 from cacheflow.utils import Counter, get_gpu_memory, get_cpu_memory
@@ -33,17 +34,22 @@ class FastAPIFrontend:
         seed: int,
         swap_space: int,
         max_num_batched_tokens: int,
+        max_num_sequences: int,
         num_nodes: int,
         num_devices_per_node: int,
         distributed_init_method: str,
         all_stage_devices: List[List[DeviceID]],
+        server_use_ray: bool,
     ):
         self.block_size = block_size
         self.tokenizer = AutoTokenizer.from_pretrained(model)
         self.seq_group_counter = Counter()
         self.seq_counter = Counter()
-        remote_server_class = ray.remote(num_cpus=0)(Server)
+        if server_use_ray:
+            remote_server_class = ray.remote(num_cpus=0)(Server)
+        else:
+            remote_server_class = ray.remote(num_gpus=1)(Server)
         self.server = remote_server_class.remote(
             model=model,
             model_path=model_path,
@@ -55,12 +61,14 @@ class FastAPIFrontend:
             seed=seed,
             swap_space=swap_space,
             max_num_batched_tokens=max_num_batched_tokens,
+            max_num_sequences=max_num_sequences,
             num_nodes=num_nodes,
             num_devices_per_node=num_devices_per_node,
             distributed_init_method=distributed_init_method,
             all_stage_devices=all_stage_devices,
             gpu_memory=get_gpu_memory(),
             cpu_memory=get_cpu_memory(),
+            use_ray=server_use_ray,
         )
         self.running_seq_groups: Dict[int, SequenceGroup] = {}
@@ -149,6 +157,7 @@ if __name__ == "__main__":
     parser.add_argument("--port", type=int, default=10002)
     parser = add_server_arguments(parser)
     args = parser.parse_args()
+    args = process_server_arguments(args)
     # TODO(zhuohan): Support pipeline parallelism.
     assert args.pipeline_parallel_size == 1, (
@@ -156,7 +165,8 @@ if __name__ == "__main__":
     (num_nodes, num_devices_per_node, distributed_init_method,
      all_stage_devices) = (
-        initialize_ray_cluster(
+        initialize_cluster(
+            use_ray=True,
             pipeline_parallel_size=args.pipeline_parallel_size,
             tensor_parallel_size=args.tensor_parallel_size))
@@ -170,10 +180,12 @@ if __name__ == "__main__":
         seed=args.seed,
         swap_space=args.swap_space,
         max_num_batched_tokens=args.max_num_batched_tokens,
+        max_num_sequences=args.max_num_sequences,
         num_nodes=num_nodes,
         num_devices_per_node=num_devices_per_node,
         distributed_init_method=distributed_init_method,
         all_stage_devices=all_stage_devices,
+        server_use_ray=args.use_ray,
     )
     uvicorn.run(app, host=args.host, port=args.port, log_level="info")
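Note that the FastAPI frontend still wraps Server in a Ray actor either way; only the resource request changes (num_cpus=0 when Ray schedules the GPU workers itself, num_gpus=1 so the single-GPU server gets a GPU reserved for the actor). A generic sketch of this Ray actor pattern, independent of cacheflow:

    import ray

    @ray.remote(num_gpus=1)      # reserve one GPU for the actor
    class Echo:
        def ping(self, x):
            return x

    ray.init()
    actor = Echo.remote()        # instantiate the actor
    # Methods are invoked via .remote() and return futures;
    # ray.get() blocks until the result is ready.
    print(ray.get(actor.ping.remote("ok")))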
cacheflow/master/server.py

 import argparse
-from typing import List, Tuple
+from typing import List, Tuple, Optional
 import random

-import ray
+import torch
+
+try:
+    import ray
+except ImportError:
+    ray = None

 from cacheflow.master.scheduler import Scheduler
 from cacheflow.models import get_memory_analyzer
@@ -31,6 +35,7 @@ class Server:
         all_stage_devices: List[List[DeviceID]],
         gpu_memory: int,
         cpu_memory: int,
+        use_ray: bool,
         collect_stats: bool = False,
         do_memory_analysis: bool = False,
     ):
@@ -38,6 +43,10 @@ class Server:
         self.num_devices_per_node = num_devices_per_node
         self.world_size = pipeline_parallel_size * tensor_parallel_size
+        if not use_ray:
+            assert self.world_size == 1, (
+                "Only support single GPU without Ray.")
+
         self.memory_analyzer = get_memory_analyzer(
             model_name=model,
             block_size=block_size,
@@ -72,6 +81,7 @@ class Server:
                 model_path=model_path,
                 use_dummy_weights=use_dummy_weights,
                 max_num_batched_tokens=max_num_batched_tokens,
+                use_ray=use_ray,
             )
             self.controllers.append(controller)
@@ -105,11 +115,30 @@ class Server:
             self.scheduler.swapped)


-def initialize_ray_cluster(
-        address: str = 'auto',
+def initialize_cluster(
+        use_ray: bool = False,
+        address: Optional[str] = None,
         pipeline_parallel_size: int = 1,
         tensor_parallel_size: int = 1,
 ) -> Tuple[int, int, str, List[List[DeviceID]]]:
+    # Initialize cluster locally.
+    if not use_ray:
+        assert pipeline_parallel_size * tensor_parallel_size == 1, (
+            "Only support single GPU without Ray.")
+        num_nodes = 1
+        num_devices_per_node = torch.cuda.device_count()
+        port = random.randint(10000, 20000)
+        # We need to setup the distributed init method to make sure
+        # the distributed megatron code (e.g., get world size) works correctly.
+        distributed_init_method = f"tcp://localhost:{port}"
+        all_stage_devices = [[(0, None, 0)]]
+        return (num_nodes, num_devices_per_node, distributed_init_method,
+                all_stage_devices)
+
+    assert ray is not None, (
+        "Ray is not installed. Please install Ray to use distributed "
+        "serving.")
     # Connect to a ray cluster.
     ray.init(address=address)
@@ -177,6 +206,7 @@ def add_server_arguments(parser: argparse.ArgumentParser):
     parser.add_argument('--model-path', type=str, default='~/.cacheflow/model_weights',
                         help='model path to download and load the weights')
     # Parallel arguments
+    parser.add_argument('--use-ray', action='store_true', help='use Ray for distributed serving, will be automatically set when using more than 1 GPU')
     parser.add_argument('--pipeline-parallel-size', '-pp', type=int, default=1, help='number of pipeline stages')
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1, help='number of tensor parallel replicas')
     # KV cache arguments
@@ -190,3 +220,8 @@ def add_server_arguments(parser: argparse.ArgumentParser):
     parser.add_argument('--max-num-sequences', type=int, default=256, help='maximum number of sequences per iteration')
     parser.add_argument('--use-dummy-weights', action='store_true', help='use dummy values for model weights')
     return parser
+
+
+def process_server_arguments(args: argparse.Namespace):
+    if args.pipeline_parallel_size * args.tensor_parallel_size > 1:
+        args.use_ray = True
+    return args
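Because ray is now imported inside try/except, the module imports cleanly when Ray is absent, and the new assert guards the distributed path. The non-Ray branch can be exercised directly; a minimal sketch of the return value, based only on the code above:

    import torch
    from cacheflow.master.server import initialize_cluster

    (num_nodes, num_devices_per_node, distributed_init_method,
     all_stage_devices) = initialize_cluster(use_ray=False)

    assert num_nodes == 1
    assert num_devices_per_node == torch.cuda.device_count()
    # distributed_init_method is "tcp://localhost:<port>" with a random
    # port in [10000, 20000]; all_stage_devices is [[(0, None, 0)]], i.e.
    # one stage with one device: (rank 0, no Ray node resource, GPU id 0).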
cacheflow/worker/controller.py

 from typing import Dict, List, Union, Tuple

-import ray
+try:
+    import ray
+except ImportError:
+    ray = None

 from cacheflow.master.scheduler import Scheduler
 from cacheflow.sequence import SequenceGroupInputs
@@ -29,6 +32,7 @@ class Controller:
         model_path: str,
         use_dummy_weights: bool,
         max_num_batched_tokens: int,
+        use_ray: bool,
     ) -> None:
         self.stage_id = stage_id
         self.stage_devices = stage_devices
@@ -36,6 +40,7 @@ class Controller:
         self.block_size = block_size
         self.num_gpu_blocks = num_gpu_blocks
         self.num_cpu_blocks = num_cpu_blocks
+        self.use_ray = use_ray

         # Which pipeline stage is this node assigned to?
         self.is_first_stage = stage_id == 0
@@ -43,10 +48,13 @@ class Controller:
         self.workers: List[Worker] = []
         for rank, node_resource, device_id in stage_devices:
-            worker_cls = ray.remote(num_cpus=0,
-                                    num_gpus=1,
-                                    resources={node_resource: 1e-5})(Worker)
-            worker = worker_cls.remote(
+            if self.use_ray:
+                worker_cls = ray.remote(num_cpus=0,
+                                        num_gpus=1,
+                                        resources={node_resource: 1e-5})(Worker).remote
+            else:
+                worker_cls = Worker
+            worker = worker_cls(
                 model_name=model_name,
                 block_size=block_size,
                 num_gpu_blocks=num_gpu_blocks,
@@ -78,17 +86,21 @@ class Controller:
         blocks_to_swap_out: Dict[int, int],
         blocks_to_copy: Dict[int, List[int]],
     ) -> None:
-        futures = []
+        all_outputs = []
         for worker in self.workers:
-            future = worker.execute_stage.remote(
+            executor = (worker.execute_stage.remote
+                        if self.use_ray else worker.execute_stage)
+            output = executor(
                 input_seq_groups,
                 blocks_to_swap_in,
                 blocks_to_swap_out,
                 blocks_to_copy,
             )
-            futures.append(future)
+            all_outputs.append(output)

-        all_outputs = ray.get(futures)
+        if self.use_ray:
+            all_outputs = ray.get(all_outputs)

         # Make sure all workers have the same results.
         output = all_outputs[0]
         for other_output in all_outputs[1:]:
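The execute_stage change boils down to one dispatch pattern: select either the Ray method handle or the plain bound method, call both the same way, and resolve futures only when Ray is in use. A self-contained sketch of that pattern (generic code, not from the commit):

    def run_on_workers(workers, use_ray, *args):
        # Call each worker's step() either as a Ray task or directly.
        outputs = []
        for worker in workers:
            executor = worker.step.remote if use_ray else worker.step
            outputs.append(executor(*args))
        if use_ray:
            import ray
            outputs = ray.get(outputs)  # resolve ObjectRefs into values
        return outputs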
simple_server.py

@@ -3,7 +3,8 @@ from typing import List
 from cacheflow.master.simple_frontend import SimpleFrontend
 from cacheflow.master.server import (Server, add_server_arguments,
-                                     initialize_ray_cluster)
+                                     process_server_arguments,
+                                     initialize_cluster)
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.utils import get_gpu_memory, get_cpu_memory
@@ -14,7 +15,8 @@ def main(args: argparse.Namespace):
     (num_nodes, num_devices_per_node, distributed_init_method,
      all_stage_devices) = (
-        initialize_ray_cluster(
+        initialize_cluster(
+            use_ray=args.use_ray,
             pipeline_parallel_size=args.pipeline_parallel_size,
             tensor_parallel_size=args.tensor_parallel_size))
@@ -37,6 +39,7 @@ def main(args: argparse.Namespace):
         all_stage_devices=all_stage_devices,
         gpu_memory=get_gpu_memory(),
         cpu_memory=get_cpu_memory(),
+        use_ray=args.use_ray,
     )

     # Create a frontend.
@@ -70,4 +73,5 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='CacheFlow simple server.')
     parser = add_server_arguments(parser)
     args = parser.parse_args()
+    args = process_server_arguments(args)
     main(args)