Project: norm/vllm

Commit 1b0bd0fe (unverified), parent 20044cab
Authored Aug 02, 2023 by Zhuohan Li; committed by GitHub on Aug 02, 2023

Add Falcon support (new) (#592)

Showing 16 changed files with 680 additions and 122 deletions (+680, -122).
Files changed:

  README.md                                                        +1    -0
  csrc/pos_encoding_kernels.cu                                      +29   -13
  docs/source/models/supported_models.rst                           +3    -0
  examples/llm_engine_example.py                                    +2    -1
  vllm/config.py                                                    +7    -2
  vllm/model_executor/layers/attention.py                           +24   -12
  vllm/model_executor/model_loader.py                               +2    -0
  vllm/model_executor/models/__init__.py                            +2    -0
  vllm/model_executor/models/falcon.py                              +496  -0
  vllm/model_executor/parallel_utils/parallel_state.py              +1    -72
  vllm/model_executor/parallel_utils/tensor_parallel/__init__.py    +2    -1
  vllm/model_executor/parallel_utils/tensor_parallel/layers.py      +16   -15
  vllm/transformers_utils/config.py                                 +2    -0
  vllm/transformers_utils/configs/__init__.py                       +5    -0
  vllm/transformers_utils/configs/falcon.py                         +87   -0
  vllm/worker/worker.py                                             +1    -6
README.md

@@ -44,6 +44,7 @@ vLLM seamlessly supports many Huggingface models, including the following architectures:
 - Baichuan-7B (`baichuan-inc/Baichuan-7B`)
 - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
+- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
 - GPT-2 (`gpt2`, `gpt2-xl`, etc.)
 - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
 - GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
csrc/pos_encoding_kernels.cu

@@ -10,7 +10,8 @@ __global__ void rotary_embedding_neox_kernel(
   scalar_t* __restrict__ key,                  // [num_tokens, num_kv_heads, head_size]
   const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim // 2]
   const int rot_dim,
-  const int stride,
+  const int query_stride,
+  const int key_stride,
   const int num_heads,
   const int num_kv_heads,
   const int head_size) {

@@ -23,14 +24,14 @@ __global__ void rotary_embedding_neox_kernel(
   const int nq = num_heads * embed_dim;
   for (int i = threadIdx.x; i < nq; i += blockDim.x) {
     const int head_idx = i / embed_dim;
-    const int token_head = token_idx * stride + head_idx * head_size;
+    const int token_head = token_idx * query_stride + head_idx * head_size;
     const int rot_offset = i % embed_dim;
     const int x_index = rot_offset;
     const int y_index = embed_dim + rot_offset;

-    const int out_x = token_idx * stride + head_idx * head_size + x_index;
-    const int out_y = token_idx * stride + head_idx * head_size + y_index;
+    const int out_x = token_idx * query_stride + head_idx * head_size + x_index;
+    const int out_y = token_idx * query_stride + head_idx * head_size + y_index;

     const scalar_t cos = __ldg(cache_ptr + x_index);
     const scalar_t sin = __ldg(cache_ptr + y_index);

@@ -39,13 +40,27 @@ __global__ void rotary_embedding_neox_kernel(
     const scalar_t q_y = query[token_head + y_index];
     query[out_x] = q_x * cos - q_y * sin;
     query[out_y] = q_y * cos + q_x * sin;
+  }

-    if (head_idx < num_kv_heads) {
-      const scalar_t k_x = key[token_head + x_index];
-      const scalar_t k_y = key[token_head + y_index];
-      key[out_x] = k_x * cos - k_y * sin;
-      key[out_y] = k_y * cos + k_x * sin;
-    }
+  const int nk = num_kv_heads * embed_dim;
+  for (int i = threadIdx.x; i < nk; i += blockDim.x) {
+    const int head_idx = i / embed_dim;
+    const int token_head = token_idx * key_stride + head_idx * head_size;
+    const int rot_offset = i % embed_dim;
+    const int x_index = rot_offset;
+    const int y_index = embed_dim + rot_offset;
+
+    const int out_x = token_idx * key_stride + head_idx * head_size + x_index;
+    const int out_y = token_idx * key_stride + head_idx * head_size + y_index;
+
+    const scalar_t cos = __ldg(cache_ptr + x_index);
+    const scalar_t sin = __ldg(cache_ptr + y_index);
+
+    const scalar_t k_x = key[token_head + x_index];
+    const scalar_t k_y = key[token_head + y_index];
+    key[out_x] = k_x * cos - k_y * sin;
+    key[out_y] = k_y * cos + k_x * sin;
   }
 }

@@ -62,8 +77,8 @@ void rotary_embedding_neox(
   int rot_dim = cos_sin_cache.size(1);
   int num_heads = query.size(1) / head_size;
   int num_kv_heads = key.size(1) / head_size;
-  int stride = query.stride(0);
-  TORCH_CHECK(stride == key.stride(0));
+  int query_stride = query.stride(0);
+  int key_stride = key.stride(0);
   dim3 grid(num_tokens);
   dim3 block(std::min(num_heads * rot_dim / 2, 512));

@@ -80,7 +95,8 @@ void rotary_embedding_neox(
       key.data_ptr<scalar_t>(),
       cos_sin_cache.data_ptr<scalar_t>(),
       rot_dim,
-      stride,
+      query_stride,
+      key_stride,
       num_heads,
       num_kv_heads,
       head_size);
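For intuition, here is a rough PyTorch reference of the NeoX-style rotation the kernel applies. The query and key tensors are viewed as [num_tokens, heads, head_size] with different head counts, which is exactly why the kernel now takes separate query_stride and key_stride values instead of one shared stride. This is a hedged sketch for illustration only, not the CUDA kernel; the helper name is made up, and the assumption that cos_sin_cache stores the cos half followed by the sin half per position follows the kernel comments above.

import torch


def rotary_embedding_neox_ref(positions: torch.Tensor,
                              query: torch.Tensor,          # [num_tokens, num_heads, head_size]
                              key: torch.Tensor,            # [num_tokens, num_kv_heads, head_size]
                              cos_sin_cache: torch.Tensor,  # [max_position, rot_dim]
                              rot_dim: int):
    embed_dim = rot_dim // 2
    cos_sin = cos_sin_cache[positions]           # [num_tokens, rot_dim]
    cos, sin = cos_sin.split(embed_dim, dim=-1)  # cos half, then sin half
    cos = cos[:, None, :]                        # broadcast over heads
    sin = sin[:, None, :]

    def rotate(x: torch.Tensor) -> torch.Tensor:
        # Only the first rot_dim elements of each head are rotated, pairing
        # element i with element embed_dim + i (NeoX style), as in the kernel.
        x1 = x[..., :embed_dim].clone()
        x2 = x[..., embed_dim:rot_dim].clone()
        x[..., :embed_dim] = x1 * cos - x2 * sin
        x[..., embed_dim:rot_dim] = x2 * cos + x1 * sin
        return x

    # key simply has fewer heads than query; in the flat layout used by the
    # kernel, that difference in row width is what the two strides encode.
    return rotate(query), rotate(key)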
docs/source/models/supported_models.rst

@@ -20,6 +20,9 @@ Alongside each architecture, we include some popular models that use it.
   * - :code:`BloomForCausalLM`
     - BLOOM, BLOOMZ, BLOOMChat
     - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc.
+  * - :code:`FalconForCausalLM`
+    - Falcon
+    - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
   * - :code:`GPT2LMHeadModel`
     - GPT-2
     - :code:`gpt2`, :code:`gpt2-xl`, etc.
examples/llm_engine_example.py

@@ -10,7 +10,8 @@ def main(args: argparse.Namespace):
     # Test the following prompts.
     test_prompts = [
-        ("A robot may not injure a human being", SamplingParams()),
+        ("A robot may not injure a human being",
+         SamplingParams(temperature=0.0)),
         ("To be or not to be,",
          SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
         ("What is the meaning of life?",
vllm/config.py

@@ -94,8 +94,13 @@ class ModelConfig:
         return self.hf_config.hidden_size // self.hf_config.num_attention_heads

     def get_num_heads(self, parallel_config: "ParallelConfig") -> int:
-        # For GPTBigCode:
-        if getattr(self.hf_config, "multi_query", False):
+        # For GPTBigCode & Falcon:
+        # Note: for falcon, when new_decoder_architecture is True, the
+        # multi_query flag is ignored and we use n_head_kv for the number of
+        # KV heads.
+        if (getattr(self.hf_config, "multi_query", False) and
+                (self.hf_config.model_type == "falcon" and
+                 not getattr(self.hf_config, "new_decoder_architecture", False))):
             # Multi-query attention, only one KV head.
             return 1
         # For Falcon:
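Restricted to the Falcon case described by the new comment, the branch behaves as in this hedged sketch; SimpleNamespace stands in for a Hugging Face config object and the values are illustrative, not loaded from the Hub.

from types import SimpleNamespace

# Illustrative Falcon-style configs (made-up objects, not real checkpoints).
falcon_7b = SimpleNamespace(model_type="falcon", multi_query=True,
                            new_decoder_architecture=False)
falcon_40b = SimpleNamespace(model_type="falcon", multi_query=True,
                             new_decoder_architecture=True, n_head_kv=8)


def collapses_to_one_kv_head(cfg) -> bool:
    # Mirrors the condition added above for falcon model types: multi-query
    # means a single KV head unless new_decoder_architecture is set, in which
    # case n_head_kv is used instead.
    return (getattr(cfg, "multi_query", False) and
            (cfg.model_type == "falcon" and
             not getattr(cfg, "new_decoder_architecture", False)))


print(collapses_to_one_kv_head(falcon_7b))   # True  -> return 1
print(collapses_to_one_kv_head(falcon_40b))  # False -> fall through to n_head_kv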
vllm/model_executor/layers/attention.py

@@ -314,14 +314,13 @@ class PagedAttentionWithRoPE(PagedAttention):
 class PagedAttentionWithALiBi(PagedAttention):
     """PagedAttention with ALiBi attention bias."""

-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        slopes: List[float],
-    ) -> None:
-        super().__init__(num_heads, head_size, scale)
+    def __init__(self,
+                 num_heads: int,
+                 head_size: int,
+                 scale: float,
+                 slopes: List[float],
+                 num_kv_heads: Optional[int] = None) -> None:
+        super().__init__(num_heads, head_size, scale, num_kv_heads)
         assert len(slopes) == num_heads

         slopes = torch.tensor(slopes, dtype=torch.float32)

@@ -334,6 +333,11 @@ class PagedAttentionWithALiBi(PagedAttention):
         # Generates ALiBi mask for each prompt.
         for prompt_len in input_metadata.prompt_lens:
             bias = torch.arange(prompt_len)
+            # Note(zhuohan): HF uses
+            #     `bias = bias[None, :].repeat(prompt_len, 1)`
+            # here. We find that both biases give the same results, but
+            # the bias below more accurately follows the original ALiBi
+            # paper.
             bias = bias[None, :] - bias[:, None]
             bias = bias.to(self.alibi_slopes.device)

@@ -363,10 +367,17 @@ class PagedAttentionWithALiBi(PagedAttention):
         Args:
             output: shape = [num_prompt_tokens, num_heads, head_size]
             query: shape = [num_prompt_tokens, num_heads, head_size]
-            key: shape = [num_prompt_tokens, num_heads, head_size]
-            value: shape = [num_prompt_tokens, num_heads, head_size]
+            key: shape = [num_prompt_tokens, num_kv_heads, head_size]
+            value: shape = [num_prompt_tokens, num_kv_heads, head_size]
             input_metadata: metadata for paged attention.
         """
+        if self.num_kv_heads != self.num_heads:
+            # Project the key and value tensors to the desired number of heads.
+            key = torch.repeat_interleave(key, self.num_queries_per_kv, dim=1)
+            value = torch.repeat_interleave(value,
+                                            self.num_queries_per_kv,
+                                            dim=1)
+
         # FIXME(woosuk): Because xformers does not support dynamic sequence
         # lengths with custom attention bias, we process each prompt one by
         # one. This is inefficient, especially when we have many short prompts.

@@ -400,9 +411,10 @@ class PagedAttentionWithALiBi(PagedAttention):
         Args:
             output: shape = [num_generation_tokens, num_heads, head_size]
             query: shape = [num_generation_tokens, num_heads, head_size]
-            key_cache: shape = [num_blocks, num_heads, head_size/x,
+            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
                 block_size, x]
-            value_cache: shape = [num_blocks, num_heads, head_size, block_size]
+            value_cache: shape = [num_blocks, num_kv_heads, head_size,
+                block_size]
             input_metadata: metadata for paged attention.
         """
         block_size = value_cache.shape[3]
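To picture the key/value projection added above, here is a small hedged sketch with made-up sizes; num_queries_per_kv is assumed to be num_heads // num_kv_heads, which is how grouped-query and multi-query attention are usually parameterized.

import torch

num_tokens, num_heads, num_kv_heads, head_size = 5, 8, 2, 64
num_queries_per_kv = num_heads // num_kv_heads  # 4 query heads share each KV head

key = torch.randn(num_tokens, num_kv_heads, head_size)
value = torch.randn(num_tokens, num_kv_heads, head_size)

# Repeat each KV head so the tensors line up with the 8 query heads before
# the prompt-attention kernel is invoked.
key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)

assert key.shape == (num_tokens, num_heads, head_size)
assert value.shape == (num_tokens, num_heads, head_size)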
vllm/model_executor/model_loader.py

@@ -14,6 +14,7 @@ _MODEL_REGISTRY = {
     "BaiChuanForCausalLM": BaiChuanForCausalLM,  # baichuan-7b
     "BaichuanForCausalLM": BaichuanForCausalLM,  # baichuan-13b
     "BloomForCausalLM": BloomForCausalLM,
+    "FalconForCausalLM": FalconForCausalLM,
     "GPT2LMHeadModel": GPT2LMHeadModel,
     "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM,
     "GPTJForCausalLM": GPTJForCausalLM,

@@ -22,6 +23,7 @@ _MODEL_REGISTRY = {
     "LLaMAForCausalLM": LlamaForCausalLM,  # For decapoda-research/llama-*
     "MPTForCausalLM": MPTForCausalLM,
     "OPTForCausalLM": OPTForCausalLM,
+    "RWForCausalLM": FalconForCausalLM,
 }
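Both the new `FalconForCausalLM` entry and the legacy `RWForCausalLM` name used by the original RefinedWeb checkpoints resolve to the same model class. The lookup below is an illustrative sketch of how such an architecture registry is typically consulted, not the loader's actual code; values are plain strings only to keep the sketch self-contained.

# Illustrative subset of the registry above.
_MODEL_REGISTRY = {
    "FalconForCausalLM": "vllm.model_executor.models.falcon:FalconForCausalLM",
    "RWForCausalLM": "vllm.model_executor.models.falcon:FalconForCausalLM",
}


def resolve_architecture(architectures: list) -> str:
    # hf_config.architectures typically holds names like ["RWForCausalLM"].
    for arch in architectures:
        if arch in _MODEL_REGISTRY:
            return _MODEL_REGISTRY[arch]
    raise ValueError(f"Unsupported architectures: {architectures}")


print(resolve_architecture(["RWForCausalLM"]))  # legacy Falcon checkpoints still load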
vllm/model_executor/models/__init__.py

 from vllm.model_executor.models.baichuan import BaiChuanForCausalLM, BaichuanForCausalLM
 from vllm.model_executor.models.bloom import BloomForCausalLM
+from vllm.model_executor.models.falcon import FalconForCausalLM
 from vllm.model_executor.models.gpt2 import GPT2LMHeadModel
 from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM
 from vllm.model_executor.models.gpt_j import GPTJForCausalLM

@@ -12,6 +13,7 @@ __all__ = [
     "BaiChuanForCausalLM",
     "BaichuanForCausalLM",
     "BloomForCausalLM",
+    "FalconForCausalLM",
     "GPT2LMHeadModel",
     "GPTBigCodeForCausalLM",
     "GPTJForCausalLM",
vllm/model_executor/models/falcon.py  (new file, 0 → 100644, +496 lines; diff collapsed)
vllm/model_executor/parallel_utils/parallel_state.py

@@ -44,7 +44,6 @@ _PIPELINE_GLOBAL_RANKS = None
 # rank when broadcasting weights from src to all other data parallel ranks
 _DATA_PARALLEL_GLOBAL_RANKS = None
-_ALL_REDUCE_LAUNCHER: Optional['GraphAllReduce'] = None

 def initialize_model_parallel(
     tensor_model_parallel_size: int = 1,

@@ -196,20 +195,6 @@ def initialize_model_parallel(
     if rank in ranks:
         _POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks

-def initialize_all_reduce_launcher(
-    max_num_tokens: int,
-    hidden_size: int,
-    dtype: torch.dtype,
-    disable_graph: bool = False,
-) -> None:
-    global _ALL_REDUCE_LAUNCHER
-    _ALL_REDUCE_LAUNCHER = GraphAllReduce(
-        max_num_tokens=max_num_tokens,
-        hidden_size=hidden_size,
-        dtype=dtype,
-        disable_graph=disable_graph,
-    )

 def model_parallel_is_initialized():
     """Check if model and data parallel groups are initialized."""
     if _TENSOR_MODEL_PARALLEL_GROUP is None or \

@@ -458,6 +443,7 @@ def get_pipeline_model_parallel_last_rank():
     last_rank_local = get_pipeline_model_parallel_world_size() - 1
     return _PIPELINE_GLOBAL_RANKS[last_rank_local]

 def get_pipeline_model_parallel_next_rank():
     """Return the global rank that follows the caller in the pipeline"""
     assert _PIPELINE_GLOBAL_RANKS is not None, \

@@ -485,10 +471,6 @@ def get_data_parallel_rank():
     """Return my rank for the data parallel group."""
     return torch.distributed.get_rank(group=get_data_parallel_group())

-def get_all_reduce_launcher() -> 'GraphAllReduce':
-    assert _ALL_REDUCE_LAUNCHER is not None, 'all reduce launcher is not initialized'
-    return _ALL_REDUCE_LAUNCHER

 def destroy_model_parallel():
     """Set the groups to none."""
     global _MODEL_PARALLEL_GROUP

@@ -515,56 +497,3 @@ def destroy_model_parallel():
     _MPU_TENSOR_MODEL_PARALLEL_RANK = None
     global _MPU_PIPELINE_MODEL_PARALLEL_RANK
     _MPU_PIPELINE_MODEL_PARALLEL_RANK = None

-class GraphAllReduce:
-
-    def __init__(
-        self,
-        max_num_tokens: int,
-        hidden_size: int,
-        dtype: torch.dtype,
-        disable_graph: bool = False,
-    ) -> None:
-        self.max_num_tokens = max_num_tokens
-        self.hidden_size = hidden_size
-        self.disable_graph = disable_graph
-
-        tp_world_size = get_tensor_model_parallel_world_size()
-        if tp_world_size == 1:
-            return
-
-        self.group = get_tensor_model_parallel_group()
-        self.buffer = torch.empty(
-            size=(max_num_tokens, hidden_size),
-            dtype=dtype,
-            device='cuda',
-        )
-
-        # Build graphs for different number of tokens.
-        if not self.disable_graph:
-            self.graphs = {}
-            for num_tokens in range(8, max_num_tokens + 1, 8):
-                self.graphs[num_tokens] = self._build_graph(num_tokens)
-
-    def _build_graph(self, num_tokens: int) -> torch.cuda.CUDAGraph:
-        # Warm up.
-        torch.distributed.all_reduce(self.buffer[:num_tokens],
-                                     group=self.group)
-        torch.cuda.synchronize()
-
-        # Build graph.
-        graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(graph):
-            torch.distributed.all_reduce(self.buffer[:num_tokens],
-                                         group=self.group)
-        torch.cuda.synchronize()
-        return graph
-
-    def launch(self, x: torch.Tensor) -> torch.Tensor:
-        # NOTE: x must be a slice of self.buffer.
-        num_tokens = x.shape[0]
-        if self.disable_graph:
-            torch.distributed.all_reduce(x, group=self.group)
-        else:
-            self.graphs[num_tokens].replay()
-        return x
vllm/model_executor/parallel_utils/tensor_parallel/__init__.py

@@ -12,6 +12,7 @@ from .mappings import (
     copy_to_tensor_model_parallel_region,
     gather_from_tensor_model_parallel_region,
     gather_from_sequence_parallel_region,
+    reduce_from_tensor_model_parallel_region,
     scatter_to_tensor_model_parallel_region,
     scatter_to_sequence_parallel_region,
 )

@@ -38,7 +39,7 @@ __all__ = [
     "copy_to_tensor_model_parallel_region",
     "gather_from_tensor_model_parallel_region",
     "gather_from_sequence_parallel_region",
-    # "reduce_from_tensor_model_parallel_region",
+    "reduce_from_tensor_model_parallel_region",
     "scatter_to_tensor_model_parallel_region",
     "scatter_to_sequence_parallel_region",
     # random.py
vllm/model_executor/parallel_utils/tensor_parallel/layers.py

@@ -14,7 +14,6 @@ from torch.nn.parameter import Parameter
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
-    get_all_reduce_launcher,
 )
 from .mappings import (
     copy_to_tensor_model_parallel_region,

@@ -248,8 +247,8 @@ class ColumnParallelLinear(torch.nn.Module):
         self.output_size = output_size
         self.gather_output = gather_output
         # Divide the weight matrix along the last dimension.
-        world_size = get_tensor_model_parallel_world_size()
-        self.output_size_per_partition = divide(output_size, world_size)
+        self.world_size = get_tensor_model_parallel_world_size()
+        self.output_size_per_partition = divide(output_size, self.world_size)
         self.skip_bias_add = skip_bias_add

         if params_dtype is None:

@@ -350,6 +349,7 @@ class RowParallelLinear(torch.nn.Module):
         params_dtype:
         use_cpu_initialization:
         perform_initialization:
+        reduce_results:
     """

     def __init__(self, input_size, output_size, *,

@@ -360,6 +360,7 @@ class RowParallelLinear(torch.nn.Module):
                  params_dtype=None,
                  use_cpu_initialization=False,
                  perform_initialization=True,
+                 reduce_results=True,
                  ):
         super(RowParallelLinear, self).__init__()

@@ -367,14 +368,19 @@ class RowParallelLinear(torch.nn.Module):
         self.input_size = input_size
         self.output_size = output_size
         self.input_is_parallel = input_is_parallel
+        self.reduce_results = reduce_results
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()

         # Divide the weight matrix along the last dimension.
-        world_size = get_tensor_model_parallel_world_size()
-        self.input_size_per_partition = divide(input_size, world_size)
+        self.world_size = get_tensor_model_parallel_world_size()
+        self.input_size_per_partition = divide(input_size, self.world_size)
         self.skip_bias_add = skip_bias_add

+        if not reduce_results and (bias and not skip_bias_add):
+            raise ValueError("When not reduce the results, adding bias to the "
+                             "results can lead to incorrect results")
+
         # Parameters.
         # Note: torch.nn.functional.linear performs XA^T + b and as a result
         # we allocate the transpose.

@@ -427,17 +433,12 @@ class RowParallelLinear(torch.nn.Module):
             input_parallel = input_
         else:
             input_parallel = scatter_to_tensor_model_parallel_region(input_)
-        if get_tensor_model_parallel_world_size() == 1:
-            # Matrix multiply.
-            output_ = F.linear(input_parallel, self.weight)
+        # Matrix multiply.
+        output_parallel = F.linear(input_parallel, self.weight)
+        if self.reduce_results and self.world_size > 1:
+            output_ = reduce_from_tensor_model_parallel_region(output_parallel)
         else:
-            # Matrix multiply.
-            all_reduce_launcher = get_all_reduce_launcher()
-            num_tokens = input_parallel.shape[0]
-            output_buffer = all_reduce_launcher.buffer[:num_tokens]
-            torch.matmul(input_parallel, self.weight_t, out=output_buffer)
-            # All-reduce across all the partitions.
-            output_ = all_reduce_launcher.launch(output_buffer)
+            output_ = output_parallel

         if not self.skip_bias_add:
             output = output_ + self.bias if self.bias is not None else output_
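The reworked RowParallelLinear.forward path boils down to the control flow in this hedged, standalone sketch (a toy function, not the vLLM class itself): each tensor-parallel rank computes its partial product, and reduce_results decides whether the all-reduce happens here or is deferred to the caller, which is what lets a model combine several partial outputs before a single reduction.

import torch
import torch.nn.functional as F


def row_parallel_forward(input_parallel: torch.Tensor,
                         weight: torch.Tensor,
                         reduce_results: bool,
                         world_size: int,
                         group=None) -> torch.Tensor:
    # Each tensor-parallel rank multiplies its input shard by its weight shard.
    output_parallel = F.linear(input_parallel, weight)
    if reduce_results and world_size > 1:
        # Sum the partial products across ranks (requires torch.distributed
        # to be initialized).
        torch.distributed.all_reduce(output_parallel, group=group)
    # With reduce_results=False the partial result is returned as-is and the
    # caller is responsible for reducing it later.
    return output_parallel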
vllm/transformers_utils/config.py

@@ -5,6 +5,8 @@ from vllm.transformers_utils.configs import *  # pylint: disable=wildcard-import
 _CONFIG_REGISTRY = {
     "mpt": MPTConfig,
     "baichuan": BaiChuanConfig,
+    "RefinedWeb": RWConfig,       # For tiiuae/falcon-40b(-instruct)
+    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
 }
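The original Falcon checkpoints report model_type "RefinedWeb" (40B) or "RefinedWebModel" (7B), and both now map to the RWConfig class added in this commit. A hedged sketch of that lookup; the helper below is illustrative, not the file's actual get_config.

from vllm.transformers_utils.configs import RWConfig  # exported in this commit

_CONFIG_REGISTRY = {
    "RefinedWeb": RWConfig,       # For tiiuae/falcon-40b(-instruct)
    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
}


def config_class_for(model_type: str):
    # None means: fall back to the regular transformers AutoConfig path.
    return _CONFIG_REGISTRY.get(model_type)


assert config_class_for("RefinedWebModel") is RWConfig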
vllm/transformers_utils/configs/__init__.py

 from vllm.transformers_utils.configs.mpt import MPTConfig
 from vllm.transformers_utils.configs.baichuan import BaiChuanConfig
+# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
+# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
+# `FalconConfig` class from the official HuggingFace transformers library.
+from vllm.transformers_utils.configs.falcon import RWConfig

 __all__ = [
     "MPTConfig",
     "BaiChuanConfig",
+    "RWConfig",
 ]
vllm/transformers_utils/configs/falcon.py  (new file, 0 → 100644)

# Adapted from
# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
# Copyright 2023 The vLLM team.
# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Falcon configuration"""
from transformers.configuration_utils import PretrainedConfig


class RWConfig(PretrainedConfig):
    model_type = "falcon"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_kv_heads": "n_head_kv",
    }

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        multi_query=True,
        n_head_kv=None,
        alibi=False,
        bias=False,
        parallel_attn=False,
        new_decoder_architecture=False,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size
        # Backward compatibility with n_embed kwarg
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.multi_query = multi_query
        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
        self.alibi = alibi
        self.bias = bias
        self.parallel_attn = parallel_attn
        self.new_decoder_architecture = new_decoder_architecture

        if self.hidden_size == 8192:
            # Hack for falcon-40b
            self.new_decoder_architecture = True

        super().__init__(bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)

    @property
    def head_dim(self):
        return self.hidden_size // self.n_head

    @property
    def rotary(self):
        return not self.alibi
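A brief usage sketch of the new RWConfig; the constructor arguments below are in the ballpark of the public falcon-7b checkpoint and are illustrative only, not loaded from the Hub.

from vllm.transformers_utils.configs.falcon import RWConfig

# Roughly falcon-7b-like settings (illustrative values).
cfg = RWConfig(vocab_size=65024, hidden_size=4544, n_layer=32, n_head=71,
               multi_query=True, parallel_attn=True, bias=False)

assert cfg.model_type == "falcon"
assert cfg.n_head_kv == 1                # multi-query default: one KV head
assert cfg.head_dim == 4544 // 71        # hidden_size // n_head
assert cfg.rotary                        # rotary embeddings unless ALiBi is set
assert not cfg.new_decoder_architecture  # only forced on for 8192-wide (40B) models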
vllm/worker/worker.py

@@ -9,7 +9,7 @@ from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig)
 from vllm.model_executor import get_model, InputMetadata, set_random_seed
 from vllm.model_executor.parallel_utils.parallel_state import (
-    initialize_model_parallel, initialize_all_reduce_launcher)
+    initialize_model_parallel)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import SequenceData, SequenceGroupMetadata, SequenceOutputs
 from vllm.worker.cache_engine import CacheEngine

@@ -65,11 +65,6 @@ class Worker:
         # Initialize the model.
         set_random_seed(self.model_config.seed)
         self.model = get_model(self.model_config)
-        initialize_all_reduce_launcher(
-            self.scheduler_config.max_num_batched_tokens,
-            self.model_config.get_hidden_size(),
-            self.model_config.dtype,
-        )

     @torch.inference_mode()
     def profile_num_available_blocks(