norm / vllm · commit 7c041ab5 (unverified)

Refactor system architecture (#82)

Authored by Woosuk Kwon on May 09, 2023; committed via GitHub on May 09, 2023.
Parent: 8917782a

The commit changes 39 files in total; this page (1 of 2) shows 20 changed files with 100 additions and 92 deletions (+100 −92).
cacheflow/core/block_manager.py                    +0  -0
cacheflow/core/policy.py                           +0  -0
cacheflow/core/scheduler.py                        +2  -2
cacheflow/core/server.py                           +5  -4
cacheflow/frontend/fastapi_frontend.py             +7  -7
cacheflow/frontend/simple_frontend.py              +0  -0
cacheflow/model_executor/__init__.py               +11 -0
cacheflow/model_executor/input_metadata.py         +0  -0
cacheflow/model_executor/layers/activation.py      +0  -0
cacheflow/model_executor/layers/attention.py       +1  -1
cacheflow/model_executor/layers/layernorm.py       +0  -0
cacheflow/model_executor/layers/sampler.py         +4  -3
cacheflow/model_executor/memory_analyzer.py        +1  -1
cacheflow/model_executor/model_loader.py           +8  -11
cacheflow/model_executor/models/__init__.py        +12 -0
cacheflow/model_executor/models/gpt2.py            +10 -14
cacheflow/model_executor/models/gpt_neox.py        +16 -19
cacheflow/model_executor/models/llama.py           +13 -16
cacheflow/model_executor/models/opt.py             +10 -14
cacheflow/model_executor/parallel_utils/README.md  +0  -0
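Taken together, the moves split the old flat layout three ways: `cacheflow.master` becomes `cacheflow.core`, both frontends land under `cacheflow.frontend`, and `cacheflow.models` plus `cacheflow.parallel_utils` are absorbed into a new `cacheflow.model_executor` package. The resulting layout implied by the paths above (only files this page touches are shown):

```
cacheflow/
├── core/
│   ├── block_manager.py
│   ├── policy.py
│   ├── scheduler.py
│   └── server.py
├── frontend/
│   ├── fastapi_frontend.py
│   └── simple_frontend.py
└── model_executor/
    ├── __init__.py
    ├── input_metadata.py
    ├── memory_analyzer.py
    ├── model_loader.py
    ├── layers/
    │   ├── activation.py
    │   ├── attention.py
    │   ├── layernorm.py
    │   └── sampler.py
    ├── models/
    │   ├── __init__.py
    │   ├── gpt2.py
    │   ├── gpt_neox.py
    │   ├── llama.py
    │   └── opt.py
    └── parallel_utils/
        └── README.md
```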
cacheflow/master/block_manager.py → cacheflow/core/block_manager.py
File moved.

cacheflow/master/policy.py → cacheflow/core/policy.py
File moved.
cacheflow/master/scheduler.py → cacheflow/core/scheduler.py

```diff
@@ -4,8 +4,8 @@ import pickle
 import time
 from typing import Any, Dict, List, Optional, Tuple

-from cacheflow.master.block_manager import BlockSpaceManager
-from cacheflow.master.policy import PolicyFactory
+from cacheflow.core.block_manager import BlockSpaceManager
+from cacheflow.core.policy import PolicyFactory
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence
 from cacheflow.sequence import SequenceGroup
```
cacheflow/master/server.py → cacheflow/core/server.py

```diff
@@ -8,20 +8,21 @@ try:
 except ImportError:
     ray = None

+from cacheflow.core.scheduler import Scheduler
+from cacheflow.frontend.simple_frontend import SimpleFrontend
 from cacheflow.logger import init_logger
-from cacheflow.master.scheduler import Scheduler
-from cacheflow.master.simple_frontend import SimpleFrontend
-from cacheflow.models import get_memory_analyzer
-from cacheflow.worker.controller import Controller, DeviceID
+from cacheflow.model_executor import get_memory_analyzer
 from cacheflow.sequence import SequenceGroup
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.utils import get_gpu_memory, get_cpu_memory
+from cacheflow.worker.controller import Controller, DeviceID

 logger = init_logger(__name__)


 class Server:
     def __init__(
         self,
         model: str,
```
cacheflow/http_frontend/fastapi_frontend.py → cacheflow/frontend/fastapi_frontend.py

```diff
 import argparse
 import asyncio
+import json
 import time
 from typing import List, Dict, Optional
-import json

+import ray
+from transformers import AutoTokenizer
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse
-import ray
-from transformers import AutoTokenizer
 import uvicorn

+from cacheflow.core.server import (Server, add_server_arguments,
+                                   process_server_arguments,
+                                   initialize_cluster)
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence, SequenceGroup
-from cacheflow.master.server import (Server, add_server_arguments,
-                                     process_server_arguments,
-                                     initialize_cluster)
-from cacheflow.worker.controller import DeviceID
 from cacheflow.utils import Counter, get_gpu_memory, get_cpu_memory
+from cacheflow.worker.controller import DeviceID

 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds
 app = FastAPI()
```
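The imports above (FastAPI, Request, StreamingResponse, uvicorn) are the standard FastAPI streaming stack. As a minimal illustrative sketch only — the route, payload fields, and generator below are hypothetical, not this file's actual endpoint:

```python
# Sketch of the FastAPI streaming pattern implied by the imports above.
# Endpoint path, payload fields, and generator logic are hypothetical.
import asyncio
import json

import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.post("/generate")
async def generate(request: Request):
    payload = await request.json()
    prompt = payload.get("prompt", "")

    async def stream_results():
        # A real server would yield tokens as the engine produces them;
        # here we just echo the prompt word by word.
        for token in prompt.split():
            yield (json.dumps({"text": token}) + "\n").encode("utf-8")
            await asyncio.sleep(0)  # cooperatively yield control

    return StreamingResponse(stream_results())

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8000)
```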
cacheflow/master/simple_frontend.py → cacheflow/frontend/simple_frontend.py
File moved.
cacheflow/model_executor/__init__.py (new file, mode 100644)

```python
from cacheflow.model_executor.input_metadata import InputMetadata
from cacheflow.model_executor.model_loader import get_model, get_memory_analyzer
from cacheflow.model_executor.utils import set_random_seed


__all__ = [
    "InputMetadata",
    "get_model",
    "get_memory_analyzer",
    "set_random_seed",
]
```
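Since `__init__.py` re-exports these names, callers can now import from the package root. A hypothetical usage sketch — the argument to `get_model` is an assumption, not taken from this commit:

```python
# Hypothetical usage of the new package surface; get_model's real
# signature is not shown in this diff.
from cacheflow.model_executor import get_model, set_random_seed

set_random_seed(0)                      # exported seeding helper
model = get_model("facebook/opt-125m")  # argument assumed for illustration
```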
cacheflow/models/input_metadata.py → cacheflow/model_executor/input_metadata.py
File moved.
cacheflow/models/activation.py → cacheflow/model_executor/layers/activation.py
File moved.
cacheflow/models/attention.py → cacheflow/model_executor/layers/attention.py

```diff
@@ -7,7 +7,7 @@ from xformers import ops as xops
 from cacheflow import attention_ops
 from cacheflow import cache_ops
 from cacheflow import pos_encoding_ops
-from cacheflow.models import InputMetadata
+from cacheflow.model_executor.input_metadata import InputMetadata


 class GPTCacheFlowAttention(nn.Module):
```
cacheflow/models/layernorm.py → cacheflow/model_executor/layers/layernorm.py
File moved.
cacheflow/models/sample.py → cacheflow/model_executor/layers/sampler.py

```diff
@@ -3,10 +3,11 @@ from typing import Dict, List, Tuple
 import torch
 import torch.nn as nn

-from cacheflow.models import InputMetadata
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    gather_from_tensor_model_parallel_region)
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import SequenceOutputs
-from cacheflow.parallel_utils.tensor_parallel import gather_from_tensor_model_parallel_region


 class Sampler(nn.Module):

@@ -27,7 +28,7 @@ class Sampler(nn.Module):
         # Get the logits for the next tokens.
         logits = torch.matmul(hidden_states, embedding.t())
         logits = gather_from_tensor_model_parallel_region(logits)
-        # Remove paddings in vocab.
+        # Remove paddings in vocab (if any).
         logits = logits[:, :self.vocab_size]

         # Apply temperature scaling.
```
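The "Remove paddings in vocab (if any)" step exists because the embedding table may be padded so the vocabulary splits evenly across tensor-parallel ranks; after the sharded logits are gathered, the padding columns must be sliced off. A self-contained sketch of that pattern, simulating the gather with `torch.cat` over fake shards (all sizes are illustrative):

```python
import torch

vocab_size = 50257            # true vocabulary size (illustrative)
world_size = 2                # pretend tensor-parallel degree
padded = ((vocab_size + world_size - 1) // world_size) * world_size  # 50258

hidden = torch.randn(4, 64)                  # [num_tokens, hidden_size]
embedding = torch.randn(padded, 64)          # padded embedding table
shards = embedding.chunk(world_size, dim=0)  # each rank holds one shard

# Each rank computes logits for its vocab shard; concatenating restores the
# full padded vocab dimension, standing in for
# gather_from_tensor_model_parallel_region(logits) in the diff above.
logits = torch.cat([hidden @ s.t() for s in shards], dim=-1)
logits = logits[:, :vocab_size]  # drop the padding columns, as in sampler.py
assert logits.shape == (4, vocab_size)
```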
cacheflow/models/memory_analyzer.py → cacheflow/model_executor/memory_analyzer.py

```diff
@@ -2,7 +2,7 @@ import torch
 from transformers import AutoConfig

 from cacheflow.logger import init_logger
-from cacheflow.models.utils import get_dtype_size
+from cacheflow.model_executor.utils import get_dtype_size

 logger = init_logger(__name__)
```
cacheflow/models/model_utils.py → cacheflow/model_executor/model_loader.py

```diff
@@ -5,16 +5,13 @@ import torch.nn as nn
 from transformers import AutoConfig
 from transformers import PretrainedConfig

-from cacheflow.models.memory_analyzer import CacheFlowMemoryAnalyzer
-from cacheflow.models.memory_analyzer import GPT2MemoryAnalyzer
-from cacheflow.models.memory_analyzer import GPTNeoXMemoryAnalyzer
-from cacheflow.models.memory_analyzer import LlamaMemoryAnalyzer
-from cacheflow.models.memory_analyzer import OPTMemoryAnalyzer
-from cacheflow.models.gpt2 import GPT2LMHeadModel
-from cacheflow.models.gpt_neox import GPTNeoXForCausalLM
-from cacheflow.models.llama import LlamaForCausalLM
-from cacheflow.models.opt import OPTForCausalLM
-from cacheflow.models.utils import get_torch_dtype
+from cacheflow.model_executor.memory_analyzer import (
+    CacheFlowMemoryAnalyzer, GPT2MemoryAnalyzer, GPTNeoXMemoryAnalyzer,
+    LlamaMemoryAnalyzer, OPTMemoryAnalyzer)
+from cacheflow.model_executor.models import (
+    GPT2LMHeadModel, GPTNeoXForCausalLM, LlamaForCausalLM, OPTForCausalLM)
+from cacheflow.model_executor.utils import get_torch_dtype
+from cacheflow.model_executor.weight_utils import initialize_dummy_weights

 _MODELS = {

@@ -77,7 +74,7 @@ def get_model(
             model = model.cuda()
             # NOTE(woosuk): For precise performance evaluation, we assign
             # random values to the weights.
-            model.initialize_dummy_weights()
+            initialize_dummy_weights(model)
     else:
         # Create a model instance.
         model = model_class(config)
```
cacheflow/model_executor/models/__init__.py (new file, mode 100644)

```python
from cacheflow.model_executor.models.gpt_neox import GPTNeoXForCausalLM
from cacheflow.model_executor.models.gpt2 import GPT2LMHeadModel
from cacheflow.model_executor.models.llama import LlamaForCausalLM
from cacheflow.model_executor.models.opt import OPTForCausalLM


__all__ = [
    "GPT2LMHeadModel",
    "GPTNeoXForCausalLM",
    "LlamaForCausalLM",
    "OPTForCausalLM",
]
```
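model_loader.py keys its `_MODELS` table (truncated in the diff above) on model architectures and now imports all four classes from this package. A hedged sketch of how such a registry typically dispatches — the keys and the lookup helper here are assumptions, not the commit's actual code:

```python
# Hypothetical registry sketch; the real _MODELS contents are truncated in
# the diff above, so the keys and dispatch logic are assumptions.
from cacheflow.model_executor.models import (
    GPT2LMHeadModel, GPTNeoXForCausalLM, LlamaForCausalLM, OPTForCausalLM)

_MODELS = {
    "gpt2": GPT2LMHeadModel,
    "gpt_neox": GPTNeoXForCausalLM,
    "llama": LlamaForCausalLM,
    "opt": OPTForCausalLM,
}

def lookup_model_class(architecture: str):
    # Fail loudly for architectures the executor does not support.
    if architecture not in _MODELS:
        raise ValueError(f"Unsupported model architecture: {architecture}")
    return _MODELS[architecture]
```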
cacheflow/models/gpt2.py → cacheflow/model_executor/models/gpt2.py

```diff
@@ -5,16 +5,15 @@ import torch
 from torch import nn
 from transformers import GPT2Config

-from cacheflow.models import InputMetadata
-from cacheflow.models.attention import GPTCacheFlowAttention
-from cacheflow.models.sample import Sampler
-from cacheflow.models.utils import (hf_model_weights_iterator,
-                                    load_tensor_parallel_weights)
-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
+from cacheflow.model_executor.layers.sampler import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.parallel_utils.tensor_parallel import (
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
     VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
 from cacheflow.sequence import SequenceOutputs

 KVCache = Tuple[torch.Tensor, torch.Tensor]

@@ -258,8 +257,5 @@ class GPT2LMHeadModel(nn.Module):
                 raise ValueError(f"Unexpected parameter name {name}")
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
-                                         self._row_parallel_weights)
-
-    def initialize_dummy_weights(self) -> None:
-        for param in self.state_dict().values():
-            param.data.uniform_(-1e-3, 1e-3)
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
```
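The second hunk deletes GPT2LMHeadModel's per-model `initialize_dummy_weights` method; identical deletions appear in gpt_neox.py, llama.py, and opt.py below, while model_loader.py now calls a shared `initialize_dummy_weights(model)` from `cacheflow.model_executor.weight_utils`. Judging from the removed method bodies, a minimal version of that shared helper would look like this (the actual weight_utils.py is not shown in this diff):

```python
import torch.nn as nn

def initialize_dummy_weights(model: nn.Module) -> None:
    # Mirror of the per-model methods removed in this commit: fill every
    # parameter with small uniform random values so performance benchmarks
    # can skip a real (slow) checkpoint load.
    for param in model.state_dict().values():
        param.data.uniform_(-1e-3, 1e-3)
```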
cacheflow/models/gpt_neox.py → cacheflow/model_executor/models/gpt_neox.py

```diff
@@ -3,17 +3,17 @@ from typing import Dict, List, Optional, Tuple
 import torch
 from torch import nn
+from transformers import GPTNeoXConfig

-from cacheflow.models import InputMetadata
-from cacheflow.models.attention import GPTNeoXCacheFlowAttention
-from cacheflow.models.sample import Sampler
-from cacheflow.models.utils import (hf_model_weights_iterator,
-                                    load_tensor_parallel_weights)
-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention
+from cacheflow.model_executor.layers.sampler import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.parallel_utils.tensor_parallel import (
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
     VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
 from cacheflow.sequence import SequenceOutputs

 KVCache = Tuple[torch.Tensor, torch.Tensor]

@@ -21,7 +21,7 @@ KVCache = Tuple[torch.Tensor, torch.Tensor]
 class GPTNeoXAttention(nn.Module):

-    def __init__(self, config):
+    def __init__(self, config: GPTNeoXConfig):
         super().__init__()
         self.total_num_heads = config.num_attention_heads
         self.hidden_size = config.hidden_size

@@ -63,7 +63,7 @@ class GPTNeoXAttention(nn.Module):
 class GPTNeoXMLP(nn.Module):

-    def __init__(self, config):
+    def __init__(self, config: GPTNeoXConfig):
         super().__init__()
         self.dense_h_to_4h = ColumnParallelLinear(config.hidden_size,
                                                   config.intermediate_size,

@@ -86,7 +86,7 @@ class GPTNeoXMLP(nn.Module):
 class GPTNeoXLayer(nn.Module):

-    def __init__(self, config):
+    def __init__(self, config: GPTNeoXConfig):
         super().__init__()
         self.use_parallel_residual = config.use_parallel_residual
         self.input_layernorm = nn.LayerNorm(config.hidden_size,
                                             eps=config.layer_norm_eps)

@@ -129,7 +129,7 @@ class GPTNeoXLayer(nn.Module):
 class GPTNeoXModel(nn.Module):

-    def __init__(self, config):
+    def __init__(self, config: GPTNeoXConfig):
         super().__init__()
         self.config = config

@@ -227,8 +227,5 @@ class GPTNeoXForCausalLM(nn.Module):
                 raise ValueError(f"Unexpected weight name: {name}")
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
-                                         self._row_parallel_weights)
-
-    def initialize_dummy_weights(self) -> None:
-        for param in self.state_dict().values():
-            param.data.uniform_(-1e-3, 1e-3)
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
```
cacheflow/models/llama.py → cacheflow/model_executor/models/llama.py

```diff
@@ -5,18 +5,18 @@ import torch
 from torch import nn
 from transformers import LlamaConfig

-from cacheflow.models import InputMetadata
-from cacheflow.models.activation import SiluAndMul
-from cacheflow.models.attention import GPTNeoXCacheFlowAttention
-from cacheflow.models.layernorm import RMSNorm
-from cacheflow.models.sample import Sampler
-from cacheflow.models.utils import (hf_model_weights_iterator,
-                                    load_tensor_parallel_weights)
-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.sequence import SequenceOutputs
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.activation import SiluAndMul
+from cacheflow.model_executor.layers.layernorm import RMSNorm
+from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention
+from cacheflow.model_executor.layers.sampler import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.parallel_utils.tensor_parallel import (
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
     VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
-from cacheflow.sequence import SequenceOutputs

 KVCache = Tuple[torch.Tensor, torch.Tensor]

@@ -263,8 +263,5 @@ class LlamaForCausalLM(nn.Module):
             param = state_dict[name]
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
-                                         self._row_parallel_weights)
-
-    def initialize_dummy_weights(self) -> None:
-        for param in self.state_dict().values():
-            param.data.uniform_(-1e-3, 1e-3)
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
```
cacheflow/models/opt.py → cacheflow/model_executor/models/opt.py

```diff
@@ -5,16 +5,15 @@ import torch
 from torch import nn
 from transformers import OPTConfig

-from cacheflow.models import InputMetadata
-from cacheflow.models.attention import GPTCacheFlowAttention
-from cacheflow.models.sample import Sampler
-from cacheflow.models.utils import (hf_model_weights_iterator,
-                                    load_tensor_parallel_weights)
-from cacheflow.parallel_utils.parallel_state import (
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
+from cacheflow.model_executor.layers.sampler import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from cacheflow.parallel_utils.tensor_parallel import (
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
     VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
 from cacheflow.sequence import SequenceOutputs

 KVCache = Tuple[torch.Tensor, torch.Tensor]

@@ -288,8 +287,5 @@ class OPTForCausalLM(nn.Module):
             param = state_dict[name]
             load_tensor_parallel_weights(param, loaded_weight, name,
                                          self._column_parallel_weights,
-                                         self._row_parallel_weights)
-
-    def initialize_dummy_weights(self) -> None:
-        for param in self.state_dict().values():
-            param.data.uniform_(-1e-3, 1e-3)
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
```
cacheflow/parallel_utils/README.md → cacheflow/model_executor/parallel_utils/README.md
File moved.