norm / vllm · Commits · c3442c1f

Commit c3442c1f (unverified), authored May 20, 2023 by Woosuk Kwon, committed via GitHub on May 20, 2023.
Parent: 7297fa6f

    Refactor system architecture (#109)

Changes: 23 files in the commit. This page shows 3 changed files with 117 additions and 131 deletions (+117, -131):

    cacheflow/worker/worker.py    +73  -93
    examples/simple_server.py     +44  -0
    simple_server.py              +0   -38
cacheflow/worker/worker.py

 """A GPU worker class."""
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Tuple

 import torch

-from cacheflow.model_executor import (get_model, get_cache_block_size,
-                                      InputMetadata, set_random_seed)
+from cacheflow.config import (CacheConfig, ModelConfig, ParallelConfig,
+                              SchedulerConfig)
+from cacheflow.model_executor import get_model, InputMetadata, set_random_seed
 from cacheflow.model_executor.parallel_utils.parallel_state import (
     initialize_model_parallel,
-    initialize_all_reduce_launcher,
-    get_tensor_model_parallel_world_size)
+    initialize_all_reduce_launcher)
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import (SequenceData, SequenceGroupMetadata,
                                 SequenceOutputs)
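The new imports pull the engine parameters out of cacheflow/config.py, which is changed in this commit but not shown on this page. A minimal sketch of what such config objects could look like is given below; the field names follow the attributes referenced in the rest of this diff (seed, dtype, get_hidden_size(), world_size, tensor/pipeline parallel sizes, block_size, max_num_batched_tokens, max_num_seqs), but the class bodies are assumptions, not the repository's real definitions.

# Hedged sketch of the config objects the refactored worker consumes.
from dataclasses import dataclass

import torch


@dataclass
class ModelConfig:
    model: str              # model name or path (assumed field name)
    dtype: torch.dtype      # weight/activation dtype
    seed: int               # RNG seed for reproducible generation
    hidden_size: int        # assumed; in reality read from the HF config

    def get_hidden_size(self) -> int:
        return self.hidden_size


@dataclass
class ParallelConfig:
    tensor_parallel_size: int = 1
    pipeline_parallel_size: int = 1

    @property
    def world_size(self) -> int:
        # The old Worker.__init__ took world_size explicitly; here it is
        # assumed to be the product of the two parallel degrees.
        return self.tensor_parallel_size * self.pipeline_parallel_size


@dataclass
class SchedulerConfig:
    max_num_batched_tokens: int    # token budget per scheduler iteration
    max_num_seqs: int              # sequence budget per scheduler iteration


@dataclass
class CacheConfig:
    block_size: int                # tokens per KV-cache block
    gpu_memory_utilization: float  # fraction of GPU memory the server may use
    swap_space: int                # CPU swap space in bytes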
@@ -26,59 +25,46 @@ class Worker:

     def __init__(
         self,
-        model_name: str,
-        dtype: str,
-        seed: int,
-        distributed_init_method: str,
-        rank: int,
-        world_size: int,
-        cache_dir: Optional[str],
-        use_dummy_weights: bool,
-        use_np_cache: bool,
-        max_num_batched_tokens: int,
-        max_num_sequences: int,
-        tensor_parallel_size: int = 1,
-        pipeline_parallel_size: int = 1,
+        model_config: ModelConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        rank: int,
+        distributed_init_method: str,
     ) -> None:
-        self.init_distributed_environment(distributed_init_method,
-                                          rank,
-                                          world_size,
-                                          tensor_parallel_size,
-                                          pipeline_parallel_size)
-        self.worker_id = rank
-        self.seed = seed
-        set_random_seed(self.seed)
+        self.model_config = model_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.rank = rank
+        self.distributed_init_method = distributed_init_method
+
+        # Initialize the distributed environment.
+        _init_distributed_environment(parallel_config, rank,
+                                      distributed_init_method)

         # Initialize the model.
-        self.model, self.dtype = get_model(
-            model_name, dtype=dtype, cache_dir=cache_dir,
-            use_dummy_weights=use_dummy_weights, use_np_cache=use_np_cache)
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
-        self.max_num_batched_tokens = max_num_batched_tokens
+        set_random_seed(self.model_config.seed)
+        self.model = get_model(model_config)
         initialize_all_reduce_launcher(
-            self.max_num_batched_tokens, self.model.config.hidden_size,
-            self.dtype)
-        self.max_num_sequences = max_num_sequences
-        self.num_layers = self.model.config.num_hidden_layers
-        assert self.model.config.num_attention_heads % tensor_model_parallel_world_size == 0
-        self.num_heads = (self.model.config.num_attention_heads //
-                          tensor_model_parallel_world_size)
-        self.head_size = self.model.config.hidden_size // (
-            self.num_heads * tensor_model_parallel_world_size)
-
-        # We reset the seed after initializing the model to ensure that
-        # the random state is not affected by the model initialization.
-        set_random_seed(seed)
+            self.scheduler_config.max_num_batched_tokens,
+            self.model_config.get_hidden_size(),
+            self.model_config.dtype,
+        )

-        # Uninitialized cache engine. Will be initialized by
+        # Uninitialized cache engine. Will be initialized with
         # self.init_cache_engine().
+        self.cache_config = None
         self.block_size = None
         self.cache_engine = None
         self.cache_events = None
         self.gpu_cache = None

     @torch.inference_mode()
-    def get_num_available_blocks(
-        self, block_size: int, cpu_swap_space: int,
-        gpu_memory_utilization: float) -> Tuple[int, int]:
+    def profile_num_available_blocks(
+        self,
+        block_size: int,
+        gpu_memory_utilization: float,
+        cpu_swap_space: int,
+    ) -> Tuple[int, int]:
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
         torch.cuda.empty_cache()
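Using the sketch config classes above (assumptions, not the repository's real constructors), the change to the worker construction site can be illustrated roughly as follows. All concrete values are made up for illustration; only the parameter names of the old and new Worker.__init__ come from the diff.

# Illustrative only: how a controller might build a worker before and after.
import torch

# Before this commit: every knob is a separate __init__ argument.
old_worker = Worker(
    model_name="facebook/opt-125m",
    dtype="half",
    seed=0,
    distributed_init_method="tcp://localhost:6000",
    rank=0,
    world_size=1,
    cache_dir=None,
    use_dummy_weights=False,
    use_np_cache=False,
    max_num_batched_tokens=2560,
    max_num_sequences=256,
)

# After this commit: the same knobs arrive grouped into config objects,
# so adding an option no longer touches every call site.
model_config = ModelConfig(model="facebook/opt-125m", dtype=torch.float16,
                           seed=0, hidden_size=768)       # sketch class above
parallel_config = ParallelConfig()                        # single-GPU defaults
scheduler_config = SchedulerConfig(max_num_batched_tokens=2560,
                                   max_num_seqs=256)
new_worker = Worker(
    model_config,
    parallel_config,
    scheduler_config,
    rank=0,
    distributed_init_method="tcp://localhost:6000",
)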
@@ -90,14 +76,15 @@ class Worker:

         # Enable top-k sampling to reflect the accurate memory usage.
         sampling_params = SamplingParams(top_p=0.99,
                                          top_k=self.model.config.vocab_size - 1)
+        max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
+        max_num_seqs = self.scheduler_config.max_num_seqs
         seqs = []
-        for group_id in range(self.max_num_sequences):
-            seq_len = (self.max_num_batched_tokens // self.max_num_sequences +
-                       (group_id < self.max_num_batched_tokens %
-                        self.max_num_sequences))
+        for group_id in range(max_num_seqs):
+            seq_len = (max_num_batched_tokens // max_num_seqs +
+                       (group_id < max_num_batched_tokens % max_num_seqs))
             seq_data = SequenceData([0] * seq_len)
             seq = SequenceGroupMetadata(
-                group_id=group_id,
+                request_id=str(group_id),
                 is_prompt=True,
                 seq_data={group_id: seq_data},
                 sampling_params=sampling_params,
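The profiling loop above splits the token budget as evenly as possible across the dummy sequences: every group gets max_num_batched_tokens // max_num_seqs tokens, and the first max_num_batched_tokens % max_num_seqs groups get one extra token because the boolean (group_id < remainder) adds 1. A small standalone check of that arithmetic, with illustrative numbers chosen so the remainder is non-zero:

# Illustrative check of the seq_len formula used during profiling.
max_num_batched_tokens = 1000   # example value, not from the repo
max_num_seqs = 6                # example value, not from the repo

seq_lens = [
    max_num_batched_tokens // max_num_seqs
    + (group_id < max_num_batched_tokens % max_num_seqs)
    for group_id in range(max_num_seqs)
]
assert seq_lens == [167, 167, 167, 167, 166, 166]
assert sum(seq_lens) == max_num_batched_tokens   # the budget is used exactly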
@@ -105,13 +92,14 @@ class Worker:

             )
             seqs.append(seq)

-        input_tokens, input_positions, input_metadata = self.prepare_inputs(seqs)
+        input_tokens, input_positions, input_metadata = self._prepare_inputs(seqs)

         # Execute the model.
+        num_layers = self.model_config.get_num_layers(self.parallel_config)
         self.model(
             input_ids=input_tokens,
             positions=input_positions,
-            kv_caches=[(None, None)] * self.num_layers,
+            kv_caches=[(None, None)] * num_layers,
             input_metadata=input_metadata,
             cache_events=None,
         )
@@ -121,53 +109,27 @@ class Worker:

         torch.cuda.synchronize()
         peak_memory = torch.cuda.max_memory_allocated()
         total_gpu_memory = get_gpu_memory()
-        cache_block_size = get_cache_block_size(block_size, self.num_heads,
-                                                self.head_size, self.num_layers,
-                                                self.dtype)
+        cache_block_size = CacheEngine.get_cache_block_size(
+            block_size, self.model_config, self.parallel_config)
         num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization
                               - peak_memory) // cache_block_size)
         num_cpu_blocks = int(cpu_swap_space // cache_block_size)
         torch.cuda.empty_cache()

-        # Reset the seed to ensure that the model output is not affected by
-        # the profiling.
-        set_random_seed(self.seed)
+        # Reset the seed to ensure that the random state is not affected by
+        # the model initialization and profiling.
+        set_random_seed(self.model_config.seed)
         return num_gpu_blocks, num_cpu_blocks

-    def init_cache_engine(self, block_size: int, num_gpu_blocks: int,
-                          num_cpu_blocks: int):
-        self.block_size = block_size
+    def init_cache_engine(self, cache_config: CacheConfig) -> None:
+        self.cache_config = cache_config
+        self.block_size = cache_config.block_size
         self.cache_engine = CacheEngine(
-            worker_id=self.worker_id,
-            num_layers=self.num_layers,
-            num_heads=self.num_heads,
-            head_size=self.head_size,
-            block_size=self.block_size,
-            num_gpu_blocks=num_gpu_blocks,
-            num_cpu_blocks=num_cpu_blocks,
-            dtype=self.dtype,
-        )
+            self.cache_config, self.model_config, self.parallel_config)
         self.cache_events = self.cache_engine.events
         self.gpu_cache = self.cache_engine.gpu_cache

-    def init_distributed_environment(self,
-                                     distributed_init_method: str,
-                                     rank: int,
-                                     world_size: int,
-                                     tensor_parallel_size: int = 1,
-                                     pipeline_parallel_size: int = 1) -> None:
-        """Initialize the distributed environment."""
-        torch.distributed.init_process_group(
-            backend='nccl',
-            init_method=distributed_init_method,
-            world_size=world_size,
-            rank=rank,
-        )
-        # A small all_reduce for warmup.
-        torch.distributed.all_reduce(torch.zeros(1).cuda())
-        initialize_model_parallel(tensor_parallel_size,
-                                  pipeline_parallel_size)
-
-    def prepare_inputs(
+    def _prepare_inputs(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
     ) -> Tuple[torch.LongTensor, torch.LongTensor, InputMetadata]:
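The block arithmetic above is worth spelling out: whatever fits inside the gpu_memory_utilization budget after the profiling forward pass (peak_memory) is handed to the KV cache in fixed-size blocks, and the CPU swap space is divided into blocks of the same size. A numerical sketch with illustrative sizes follows; cache_block_size in the real code depends on the model's layer count, head count, head size, block size, and dtype, all of which now come from the configs.

# Illustrative only: rough block-count arithmetic for a 40 GiB GPU.
GiB = 1024 ** 3
total_gpu_memory = 40 * GiB          # as reported by get_gpu_memory()
gpu_memory_utilization = 0.95        # fraction of the GPU the server may use
peak_memory = 15 * GiB               # weights + activations during profiling
cache_block_size = 2 * 1024 * 1024   # bytes per KV block; model-dependent
cpu_swap_space = 4 * GiB

num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization
                      - peak_memory) // cache_block_size)
num_cpu_blocks = int(cpu_swap_space // cache_block_size)
print(num_gpu_blocks, num_cpu_blocks)   # 11776 GPU blocks, 2048 CPU blocks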
@@ -284,7 +246,7 @@ class Worker:

         return tokens_tensor, positions_tensor, input_metadata

     @torch.inference_mode()
-    def execute_stage(
+    def execute_model(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
         blocks_to_swap_in: Dict[int, int],
@@ -316,7 +278,7 @@ class Worker:

             return {}

         # Prepare input tensors.
-        input_tokens, input_positions, input_metadata = self.prepare_inputs(
+        input_tokens, input_positions, input_metadata = self._prepare_inputs(
             seq_group_metadata_list)

         # Execute the model.
@@ -330,6 +292,24 @@ class Worker:

         return output


+def _init_distributed_environment(
+    parallel_config: ParallelConfig,
+    rank: int,
+    distributed_init_method: str,
+) -> None:
+    """Initialize the distributed environment."""
+    torch.distributed.init_process_group(
+        backend="nccl",
+        world_size=parallel_config.world_size,
+        rank=rank,
+        init_method=distributed_init_method,
+    )
+    # A small all_reduce for warmup.
+    torch.distributed.all_reduce(torch.zeros(1).cuda())
+    initialize_model_parallel(parallel_config.tensor_parallel_size,
+                              parallel_config.pipeline_parallel_size)
+
+
 def _pad_to_alignment(x: List[int], multiple_of: int) -> List[int]:
     return x + [0] * ((-len(x)) % multiple_of)
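_pad_to_alignment, shown as unchanged context above, pads a token list up to the next multiple of multiple_of using the (-len(x)) % multiple_of trick: the modulo of the negated length is exactly the number of elements missing to reach the next multiple. A quick standalone check:

# Quick check of the padding helper's arithmetic.
def _pad_to_alignment(x, multiple_of):
    return x + [0] * ((-len(x)) % multiple_of)

assert (-3) % 8 == 5   # Python's modulo of a negative value rounds toward the next multiple
assert _pad_to_alignment([1, 2, 3], 8) == [1, 2, 3, 0, 0, 0, 0, 0]
assert _pad_to_alignment([1, 2, 3, 4, 5, 6, 7, 8], 8) == [1, 2, 3, 4, 5, 6, 7, 8]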
examples/simple_server.py (new file, mode 0 → 100644)

import argparse
import uuid

from cacheflow import (add_server_arguments, initialize_server_from_args,
                       SamplingParams)


def main(args: argparse.Namespace):
    # Initialize the server.
    server = initialize_server_from_args(args)

    # Test the following prompts.
    test_prompts = [
        ("A robot may not injure a human being", SamplingParams()),
        ("To be or not to be,",
         SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
        ("What is the meaning of life?",
         SamplingParams(n=2, temperature=0.8, top_p=0.95,
                        frequency_penalty=0.1)),
        ("It is only with the heart that one can see rightly",
         SamplingParams(n=3, use_beam_search=True, temperature=0.0)),
    ]

    # Run the server.
    while True:
        # To test iteration-level scheduling, we add one request at each step.
        if test_prompts:
            prompt, sampling_params = test_prompts.pop(0)
            request_id = str(uuid.uuid4().hex[:8])
            server.add_request(request_id, prompt, sampling_params)

        request_outputs = server.step()
        for request_output in request_outputs:
            if request_output.done:
                print(request_output)

        if not (server.has_unfinished_requests() or test_prompts):
            break


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Simple CacheFlow server.')
    parser = add_server_arguments(parser)
    args = parser.parse_args()
    main(args)
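The new example drives the server synchronously: it injects at most one prompt per scheduler step, calls server.step() until nothing is pending, and prints each finished request output. Because requests are added while earlier ones are still generating, the loop exercises iteration-level scheduling, where new requests join batches already in flight rather than waiting for the whole batch to drain. The script would be launched along the lines of `python examples/simple_server.py --model <model-name>`; the exact flag names are defined by add_server_arguments() elsewhere in this commit, so the --model flag here is an assumption.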
simple_server.py (deleted, mode 100644 → 0)

import argparse

from cacheflow.core.server import (
    add_server_arguments, process_server_arguments,
    init_local_server_and_frontend_with_arguments)
from cacheflow.sampling_params import SamplingParams


def main(args: argparse.Namespace):
    server, frontend = init_local_server_and_frontend_with_arguments(args)

    # Test the following inputs.
    test_inputs = [
        ("A robot may not injure a human being", {}),  # Use default parameters.
        ("To be or not to be,",
         {"temperature": 0.8, "top_k": 5, "presence_penalty": 0.2}),
        ("What is the meaning of life?",
         {"n": 2, "temperature": 0.8, "top_p": 0.95, "frequency_penalty": 0.1}),
        ("It is only with the heart that one can see rightly",
         {"n": 3, "use_beam_search": True, "temperature": 0.0}),
    ]

    while True:
        if test_inputs:
            text, sampling_params_dict = test_inputs.pop(0)
            sampling_params = SamplingParams(**sampling_params_dict)
            sampling_params = frontend.add_eos_token(sampling_params)
            frontend.query(text, sampling_params)

        server.add_sequence_groups(frontend.get_inputs())
        updated_seq_groups = server.step()
        for seq_group in updated_seq_groups:
            if seq_group.is_finished():
                frontend.print_response(seq_group)

        if not (server.has_unfinished_requests() or test_inputs):
            break


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='CacheFlow simple server.')
    parser = add_server_arguments(parser)
    args = parser.parse_args()
    args = process_server_arguments(args)
    main(args)