OpenDAS / text-generation-inference / Commits

Commit e019635f, authored Nov 01, 2024 by xuxzh1

    update

Parent: 64def8e2
Changes: 171
Showing 20 changed files with 0 additions and 3249 deletions (+0, -3249):
- server/vllm/vllm/model_executor/models/llama.py (+0, -422)
- server/vllm/vllm/model_executor/models/mpt.py (+0, -284)
- server/vllm/vllm/model_executor/models/opt.py (+0, -341)
- server/vllm/vllm/model_executor/models/qwen.py (+0, -323)
- server/vllm/vllm/model_executor/parallel_utils/README.md (+0, -1)
- server/vllm/vllm/model_executor/parallel_utils/__init__.py (+0, -0)
- server/vllm/vllm/model_executor/parallel_utils/communication_op.py (+0, -47)
- server/vllm/vllm/model_executor/parallel_utils/layers.py (+0, -303)
- server/vllm/vllm/model_executor/parallel_utils/parallel_state.py (+0, -179)
- server/vllm/vllm/model_executor/parallel_utils/utils.py (+0, -70)
- server/vllm/vllm/model_executor/quantization_utils/__init__.py (+0, -20)
- server/vllm/vllm/model_executor/quantization_utils/awq.py (+0, -72)
- server/vllm/vllm/model_executor/quantization_utils/base.py (+0, -75)
- server/vllm/vllm/model_executor/utils.py (+0, -13)
- server/vllm/vllm/model_executor/weight_utils.py (+0, -321)
- server/vllm/vllm/outputs.py (+0, -119)
- server/vllm/vllm/sampling_params.py (+0, -215)
- server/vllm/vllm/sequence.py (+0, -407)
- server/vllm/vllm/transformers_utils/__init__.py (+0, -0)
- server/vllm/vllm/transformers_utils/config.py (+0, -37)
server/vllm/vllm/model_executor/models/llama.py (deleted, 100644 → 0)

```python
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only LLaMA model compatible with HuggingFace weights.

The input of the model is flattened to a 1D tensor of tokens. The model uses
InputMetadata to extract the original 2D shape of the input.
"""
from typing import Any, Dict, List, Optional, Tuple

import torch
from torch import nn
from transformers import LlamaConfig

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.quantized_linear import ParallelLinear
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.layers import VocabParallelEmbedding
from vllm.model_executor.quantization_utils import QuantizationConfig
from vllm.model_executor.weight_utils import (
    convert_pyslice_to_tensor, hf_model_weights_iterator,
    load_tensor_parallel_weights, load_padded_tensor_parallel_vocab)
from vllm.sequence import SamplerOutput

KVCache = Tuple[torch.Tensor, torch.Tensor]


class LlamaMLP(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.gate_up_proj = ParallelLinear.column(hidden_size,
                                                  2 * intermediate_size,
                                                  bias=False,
                                                  gather_output=False,
                                                  quant_config=quant_config)
        self.down_proj = ParallelLinear.row(intermediate_size,
                                            hidden_size,
                                            bias=False,
                                            input_is_parallel=True,
                                            quant_config=quant_config)
        if hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {hidden_act}. "
                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()

    def forward(self, x):
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x


class LlamaAttention(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        rope_theta: float = 10000,
        rope_scaling: Optional[Dict[str, Any]] = None,
        max_position_embeddings: int = 8192,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = num_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.total_num_kv_heads = num_kv_heads
        if self.total_num_kv_heads >= tp_size:
            # Number of KV heads is greater than TP size, so we partition
            # the KV heads across multiple tensor parallel GPUs.
            assert self.total_num_kv_heads % tp_size == 0
        else:
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        num_kv_heads_replicas = max(1, tp_size // self.total_num_kv_heads)
        self.head_dim = hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings

        self.qkv_proj = ParallelLinear.column(
            hidden_size,
            (self.total_num_heads +
             2 * self.total_num_kv_heads * num_kv_heads_replicas) *
            self.head_dim,
            bias=False,
            gather_output=False,
            quant_config=quant_config,
        )
        self.o_proj = ParallelLinear.row(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=False,
            input_is_parallel=True,
            quant_config=quant_config,
        )
        self.attn = PagedAttentionWithRoPE(
            self.num_heads,
            self.head_dim,
            self.scaling,
            base=self.rope_theta,
            max_position=self.max_position_embeddings,
            rotary_dim=self.head_dim,
            num_kv_heads=self.num_kv_heads,
            rope_scaling=rope_scaling)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        k_cache, v_cache = kv_cache
        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
                                input_metadata, cache_event)
        output, _ = self.o_proj(attn_output)
        return output


class LlamaDecoderLayer(nn.Module):

    def __init__(
        self,
        config: LlamaConfig,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.hidden_size = config.hidden_size
        # Requires transformers > 4.32.0
        rope_theta = getattr(config, "rope_theta", 10000)
        rope_scaling = getattr(config, "rope_scaling", None)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
        self.self_attn = LlamaAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            num_kv_heads=config.num_key_value_heads,
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            max_position_embeddings=max_position_embeddings,
            quant_config=quant_config,
        )
        self.mlp = LlamaMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
            quant_config=quant_config,
        )
        self.input_layernorm = RMSNorm(config.hidden_size,
                                       eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                eps=config.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        # Self Attention
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
            cache_event=cache_event,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states


class LlamaModel(nn.Module):

    def __init__(
        self,
        config: LlamaConfig,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        vocab_size = ((config.vocab_size + 63) // 64) * 64
        self.embed_tokens = VocabParallelEmbedding(
            vocab_size,
            config.hidden_size,
        )
        self.layers = nn.ModuleList([
            LlamaDecoderLayer(config, quant_config)
            for _ in range(config.num_hidden_layers)
        ])
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)
        for i in range(len(self.layers)):
            if cache_events is None:
                cache_event = None
            else:
                cache_event = cache_events[i]
            layer = self.layers[i]
            hidden_states = layer(
                positions,
                hidden_states,
                kv_caches[i],
                input_metadata,
                cache_event,
            )
        hidden_states = self.norm(hidden_states)
        return hidden_states


class LlamaForCausalLM(nn.Module):

    def __init__(
        self,
        config: LlamaConfig,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.config = config
        self.quant_config = quant_config
        self.model = LlamaModel(config, quant_config)
        vocab_size = ((config.vocab_size + 63) // 64) * 64
        # NOTE: The LM head is not quantized.
        self.lm_head = ParallelLinear.column(config.hidden_size,
                                             vocab_size,
                                             bias=False,
                                             gather_output=False,
                                             quant_config=None)
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> SamplerOutput:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata, cache_events)
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
                                   input_metadata)
        return next_tokens

    _column_parallel_layers = []
    _row_parallel_layers = ["o_proj", "down_proj"]

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     load_format: str = "auto",
                     revision: Optional[str] = None):
        if self.quant_config is None:
            weight_suffixes = ["weight"]
        else:
            weight_suffixes = self.quant_config.get_tp_tensor_names()

        column_parallel_weights: List[str] = []
        for layer in self._column_parallel_layers:
            for suffix in weight_suffixes:
                column_parallel_weights.append(f"{layer}.{suffix}")
        row_parallel_weights: List[str] = []
        for layer in self._row_parallel_layers:
            for suffix in weight_suffixes:
                row_parallel_weights.append(f"{layer}.{suffix}")

        tp_size = get_tensor_model_parallel_world_size()
        tp_rank = get_tensor_model_parallel_rank()
        q_proj_shard_size = (self.config.hidden_size // tp_size)
        num_kv_heads_replicas = max(
            1, tp_size // self.config.num_key_value_heads)
        num_kv_heads_per_gpu = max(
            1, self.config.num_key_value_heads // tp_size)
        kv_proj_shard_size = (self.config.hidden_size //
                              self.config.num_attention_heads *
                              num_kv_heads_per_gpu)
        attention_weight_specs = [
            # (weight_name, shard_size, offset)
            ("q_proj", q_proj_shard_size, 0),
            ("k_proj", kv_proj_shard_size, q_proj_shard_size),
            ("v_proj", kv_proj_shard_size,
             q_proj_shard_size + kv_proj_shard_size),
        ]
        state_dict = self.state_dict()

        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if "rotary_emb.inv_freq" in name:
                continue

            is_packed = False
            is_transposed = False
            if self.quant_config is not None:
                is_packed = self.quant_config.is_packed(name)
                is_transposed = self.quant_config.is_transposed(name)
            if is_transposed:
                loaded_weight = convert_pyslice_to_tensor(loaded_weight)
                loaded_weight = loaded_weight.T

            is_attention_weight = False
            for weight_name, shard_size, offset in attention_weight_specs:
                if weight_name not in name:
                    continue
                param = state_dict[name.replace(weight_name, "qkv_proj")]
                if is_transposed:
                    param = param.T

                if is_packed:
                    shard_size //= self.quant_config.pack_factor
                    offset //= self.quant_config.pack_factor

                if weight_name in ["k_proj", "v_proj"]:
                    shard_id = tp_rank // num_kv_heads_replicas
                else:
                    shard_id = tp_rank
                loaded_weight = loaded_weight[shard_size *
                                              shard_id:shard_size *
                                              (shard_id + 1)]
                param_slice = param.data[offset:offset + shard_size]
                assert param_slice.shape == loaded_weight.shape

                param_slice.copy_(loaded_weight)
                is_attention_weight = True
                break
            if is_attention_weight:
                continue

            is_gate_up_weight = False
            for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
                if weight_name not in name:
                    continue
                param = state_dict[name.replace(weight_name, "gate_up_proj")]
                if is_transposed:
                    param = param.T

                shard_size = param.shape[0] // 2
                loaded_weight = loaded_weight[shard_size *
                                              tp_rank:shard_size *
                                              (tp_rank + 1)]
                param_slice = param.data[shard_size * stride_id:shard_size *
                                         (stride_id + 1)]
                assert param_slice.shape == loaded_weight.shape
                param_slice.copy_(loaded_weight)
                is_gate_up_weight = True
                break
            if is_gate_up_weight:
                continue

            param = state_dict[name]
            if is_transposed:
                param = param.T

            if "embed_tokens" in name or "lm_head" in name:
                load_padded_tensor_parallel_vocab(param, loaded_weight,
                                                  tp_rank)
                continue

            load_tensor_parallel_weights(param, loaded_weight, name,
                                         column_parallel_weights,
                                         row_parallel_weights, tp_rank)
```
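As a quick illustration of the sharding arithmetic in the deleted `load_weights` above, the sketch below recomputes the padded vocabulary size and the per-rank Q/KV shard sizes. The concrete numbers (hidden size 8192, 64 attention heads, 8 KV heads, vocabulary 32000, tensor-parallel size 8) are assumptions chosen only to make the example concrete; they are not taken from the commit.

```python
# Minimal sketch of the shard-size arithmetic from the deleted llama.py
# load_weights(). All concrete model/TP numbers below are assumptions.
hidden_size = 8192          # assumed
num_attention_heads = 64    # assumed
num_key_value_heads = 8     # assumed
vocab_size = 32000          # assumed
tp_size = 8                 # assumed tensor-parallel world size

# The vocabulary is padded up to a multiple of 64 before being split.
padded_vocab = ((vocab_size + 63) // 64) * 64
assert padded_vocab == 32064

# Per-rank shard sizes for the fused qkv_proj weight.
q_proj_shard_size = hidden_size // tp_size                      # 1024
num_kv_heads_per_gpu = max(1, num_key_value_heads // tp_size)   # 1
kv_proj_shard_size = (hidden_size // num_attention_heads *
                      num_kv_heads_per_gpu)                     # 128

# Each rank's qkv_proj output: one Q shard plus two KV shards, matching
# the qkv.split([q_size, kv_size, kv_size]) call in LlamaAttention.forward.
print(q_proj_shard_size + 2 * kv_proj_shard_size)  # 1280
```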
server/vllm/vllm/model_executor/models/mpt.py (deleted, 100644 → 0)

```python
# coding=utf-8
# Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
import math
from typing import List, Optional, Tuple

import torch
import torch.nn as nn

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import PagedAttentionWithALiBi
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (convert_pyslice_to_tensor,
                                              hf_model_weights_iterator,
                                              load_tensor_parallel_weights)
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
                                                       ColumnParallelLinear,
                                                       RowParallelLinear)
from vllm.sequence import SamplerOutput
from vllm.transformers_utils.configs.mpt import MPTConfig

KVCache = Tuple[torch.Tensor, torch.Tensor]


def _get_alibi_slopes(
    total_num_heads: int,
    alibi_bias_max: int,
) -> torch.Tensor:
    next_power_of_2 = 2**math.ceil(math.log2(total_num_heads))
    m = torch.arange(1, next_power_of_2 + 1, dtype=torch.float32)
    m = m.mul(alibi_bias_max / next_power_of_2)
    slopes = 1.0 / torch.pow(2, m)
    if next_power_of_2 != total_num_heads:
        slopes = torch.concat([slopes[1::2], slopes[::2]])[:total_num_heads]
    return slopes


class MPTAttention(nn.Module):

    def __init__(self, config: MPTConfig):
        super().__init__()
        self.d_model = config.d_model
        self.total_num_heads = config.n_heads
        self.clip_qkv = config.attn_config["clip_qkv"]
        self.qk_ln = config.attn_config["qk_ln"]
        self.alibi_bias_max = config.attn_config["alibi_bias_max"]
        assert not config.attn_config["prefix_lm"]
        assert config.attn_config["alibi"]

        self.qkv_proj = ColumnParallelLinear(
            self.d_model,
            3 * self.d_model,
            bias=not config.no_bias,
            gather_output=False,
        )
        if self.qk_ln:
            self.q_ln = nn.LayerNorm(self.d_model)
            self.k_ln = nn.LayerNorm(self.d_model)
        self.out_proj = RowParallelLinear(
            self.d_model,
            self.d_model,
            bias=not config.no_bias,
            input_is_parallel=True,
        )

        tp_world_size = get_tensor_model_parallel_world_size()
        assert self.total_num_heads % tp_world_size == 0
        self.num_heads = self.total_num_heads // tp_world_size

        # Create the alibi slopes and slice them.
        tp_rank = get_tensor_model_parallel_rank()
        head_start = tp_rank * self.num_heads
        head_end = (tp_rank + 1) * self.num_heads
        alibi_slopes = _get_alibi_slopes(self.total_num_heads,
                                         self.alibi_bias_max)
        alibi_slopes = alibi_slopes[head_start:head_end].tolist()

        self.head_dim = self.d_model // self.total_num_heads
        scaling = self.head_dim**-0.5
        self.attn = PagedAttentionWithALiBi(self.num_heads, self.head_dim,
                                            scaling, alibi_slopes)

    def forward(
        self,
        position_ids: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        del position_ids  # unused.
        qkv, _ = self.qkv_proj(hidden_states)
        if self.clip_qkv is not None:
            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        if self.qk_ln:
            q = self.q_ln(q)
            k = self.k_ln(k)
        k_cache, v_cache = kv_cache
        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
                                cache_event)
        output, _ = self.out_proj(attn_output)
        return output


class MPTMLP(nn.Module):

    def __init__(self, config: MPTConfig):
        super().__init__()
        hidden_size = config.d_model
        expansion_ratio = config.expansion_ratio
        intermediate_size = expansion_ratio * hidden_size
        self.up_proj = ColumnParallelLinear(
            hidden_size,
            intermediate_size,
            bias=not config.no_bias,
            gather_output=False,
        )
        self.act = get_act_fn("gelu")
        self.down_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=not config.no_bias,
            input_is_parallel=True,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x, _ = self.up_proj(x)
        x = self.act(x)
        x, _ = self.down_proj(x)
        return x


class MPTBlock(nn.Module):

    def __init__(self, config: MPTConfig):
        super().__init__()
        hidden_size = config.d_model
        self.norm_1 = nn.LayerNorm(hidden_size)
        self.attn = MPTAttention(config)
        self.norm_2 = nn.LayerNorm(hidden_size)
        self.ffn = MPTMLP(config)

    def forward(
        self,
        position_ids: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        x = self.norm_1(hidden_states)
        x = self.attn(
            position_ids=position_ids,
            hidden_states=x,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
            cache_event=cache_event,
        )
        hidden_states = hidden_states + x
        x = self.norm_2(hidden_states)
        x = self.ffn(x)
        hidden_states = hidden_states + x
        return hidden_states


class MPTModel(nn.Module):

    def __init__(self, config: MPTConfig):
        super().__init__()
        assert config.embedding_fraction == 1.0
        assert config.norm_type == "low_precision_layernorm"

        self.wte = VocabParallelEmbedding(
            config.vocab_size,
            config.d_model,
        )
        self.blocks = nn.ModuleList(
            [MPTBlock(config) for _ in range(config.n_layers)])
        self.norm_f = nn.LayerNorm(config.d_model)
        if config.no_bias:
            for module in self.modules():
                if hasattr(module, "bias"):
                    if isinstance(module.bias, nn.Parameter):
                        # Remove the bias term in Linear and LayerNorm.
                        module.register_parameter("bias", None)

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        hidden_states = self.wte(input_ids)
        for i in range(len(self.blocks)):
            if cache_events is None:
                cache_event = None
            else:
                cache_event = cache_events[i]
            block = self.blocks[i]
            hidden_states = block(
                position_ids,
                hidden_states,
                kv_caches[i],
                input_metadata,
                cache_event,
            )
        hidden_states = self.norm_f(hidden_states)
        return hidden_states


class MPTForCausalLM(nn.Module):

    def __init__(self, config: MPTConfig):
        super().__init__()
        self.config = config
        assert config.tie_word_embeddings

        self.transformer = MPTModel(config)
        # TODO(zhuohan): create a new weight after implementing pipeline
        # parallelism
        self.lm_head_weight = self.transformer.wte.weight
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> SamplerOutput:
        hidden_states = self.transformer(input_ids, positions, kv_caches,
                                         input_metadata, cache_events)
        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
                                   input_metadata)
        return next_tokens

    _column_parallel_weights = ["wte.weight", "up_proj.weight", "up_proj.bias"]
    _row_parallel_weights = ["out_proj.weight", "down_proj.weight"]

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     load_format: str = "auto",
                     revision: Optional[str] = None):
        tp_world_size = get_tensor_model_parallel_world_size()
        tp_rank = get_tensor_model_parallel_rank()
        state_dict = self.state_dict()
        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if "Wqkv" in name:
                # NOTE(woosuk): MPT's fused QKV has the shape of
                # [3 * num_heads * head_size, hidden_size].
                # When tensor model parallelism is used, we need to shard
                # the weight along the hidden dimension.
                total_num_heads = self.config.num_attention_heads
                hidden_size = self.config.hidden_size
                head_size = hidden_size // total_num_heads
                num_heads = total_num_heads // tp_world_size
                head_start = tp_rank * num_heads
                head_end = (tp_rank + 1) * num_heads

                loaded_weight = convert_pyslice_to_tensor(loaded_weight)
                if name.endswith(".weight"):
                    loaded_weight = loaded_weight.view(3, total_num_heads,
                                                       head_size, hidden_size)
                    loaded_weight = loaded_weight[:, head_start:head_end, :, :]
                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
                elif name.endswith(".bias"):
                    loaded_weight = loaded_weight.view(3, total_num_heads,
                                                       head_size)
                    loaded_weight = loaded_weight[:, head_start:head_end, :]
                    loaded_weight = loaded_weight.reshape(-1)
                else:
                    raise ValueError(f"Unexpected parameter name {name}")
                name = name.replace("Wqkv", "qkv_proj")
            param = state_dict[name]
            load_tensor_parallel_weights(param, loaded_weight, name,
                                         self._column_parallel_weights,
                                         self._row_parallel_weights, tp_rank)
```
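The Wqkv sharding in the deleted MPT `load_weights` can be checked in isolation with plain tensors: view the fused `[3 * num_heads * head_size, hidden]` weight as `(3, heads, head_size, hidden)`, slice one rank's contiguous head range, and flatten back. The toy dimensions below are assumptions for illustration only.

```python
import torch

# Toy dimensions (assumed): 4 heads, head_size 2, hidden 8, TP world size 2.
total_num_heads, head_size, hidden_size, tp_world_size = 4, 2, 8, 2
wqkv = torch.arange(3 * total_num_heads * head_size * hidden_size,
                    dtype=torch.float32).view(-1, hidden_size)

num_heads = total_num_heads // tp_world_size
for tp_rank in range(tp_world_size):
    head_start = tp_rank * num_heads
    head_end = (tp_rank + 1) * num_heads
    # Same view/slice/reshape sequence as the deleted load_weights().
    shard = wqkv.view(3, total_num_heads, head_size, hidden_size)
    shard = shard[:, head_start:head_end, :, :]
    shard = shard.reshape(-1, hidden_size)
    # Each rank keeps its own Q, K and V head block, stacked back into
    # a [3 * num_heads * head_size, hidden_size] matrix.
    assert shard.shape == (3 * num_heads * head_size, hidden_size)
```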
server/vllm/vllm/model_executor/models/opt.py (deleted, 100644 → 0)

```python
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
# Copyright 2023 The vLLM team.
# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only OPT model compatible with HuggingFace weights.

The input of the model is flattened to a 1D tensor of tokens. The model uses
InputMetadata to extract the original 2D shape of the input.
"""
from typing import List, Optional, Tuple

import torch
from torch import nn
from transformers import OPTConfig

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import PagedAttention
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
                                              load_tensor_parallel_weights)
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
                                                       ColumnParallelLinear,
                                                       RowParallelLinear)
from vllm.sequence import SamplerOutput

KVCache = Tuple[torch.Tensor, torch.Tensor]


class OPTLearnedPositionalEmbedding(nn.Embedding):

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # OPT is set up so that if padding_idx is specified then offset the
        # embedding ids by 2 and adjust num_embeddings appropriately. Other
        # models don't have this hack
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, positions: torch.Tensor):
        return super().forward(positions + self.offset)


class OPTAttention(nn.Module):

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        bias: bool = True,
    ) -> None:
        super().__init__()
        self.embed_dim = embed_dim
        tensor_model_parallel_world_size = (
            get_tensor_model_parallel_world_size())
        total_num_heads = num_heads
        assert num_heads % tensor_model_parallel_world_size == 0
        self.num_heads = total_num_heads // tensor_model_parallel_world_size
        self.head_dim = embed_dim // total_num_heads
        self.scaling = self.head_dim**-0.5

        self.qkv_proj = ColumnParallelLinear(
            embed_dim,
            3 * embed_dim,
            bias=bias,
            gather_output=False,
        )
        self.out_proj = RowParallelLinear(
            embed_dim,
            embed_dim,
            bias=bias,
            input_is_parallel=True,
        )
        self.attn = PagedAttention(self.num_heads,
                                   self.head_dim,
                                   scale=self.scaling)

    def forward(
        self,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        key_cache, value_cache = kv_cache
        attn_output = self.attn(q, k, v, key_cache, value_cache,
                                input_metadata, cache_event)
        output, _ = self.out_proj(attn_output)
        return output


class OPTDecoderLayer(nn.Module):

    def __init__(self, config: OPTConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.self_attn = OPTAttention(
            embed_dim=self.embed_dim,
            num_heads=config.num_attention_heads,
            bias=config.enable_bias,
        )
        self.do_layer_norm_before = config.do_layer_norm_before
        self.activation_fn = get_act_fn(config.activation_function)

        self.self_attn_layer_norm = nn.LayerNorm(
            self.embed_dim,
            elementwise_affine=config.layer_norm_elementwise_affine)
        self.fc1 = ColumnParallelLinear(
            self.embed_dim,
            config.ffn_dim,
            bias=config.enable_bias,
            gather_output=False,
        )
        self.fc2 = RowParallelLinear(
            config.ffn_dim,
            self.embed_dim,
            bias=config.enable_bias,
            input_is_parallel=True,
        )
        self.final_layer_norm = nn.LayerNorm(
            self.embed_dim,
            elementwise_affine=config.layer_norm_elementwise_affine)

    def forward(
        self,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        # Self Attention
        residual = hidden_states
        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
        if self.do_layer_norm_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states = self.self_attn(hidden_states=hidden_states,
                                       kv_cache=kv_cache,
                                       input_metadata=input_metadata,
                                       cache_event=cache_event)
        hidden_states = residual + hidden_states
        # 350m applies layer norm AFTER attention
        if not self.do_layer_norm_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)

        # Fully Connected
        residual = hidden_states
        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
        if self.do_layer_norm_before:
            hidden_states = self.final_layer_norm(hidden_states)
        hidden_states, _ = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states, _ = self.fc2(hidden_states)
        hidden_states = residual + hidden_states
        # 350m applies layer norm AFTER attention
        if not self.do_layer_norm_before:
            hidden_states = self.final_layer_norm(hidden_states)
        return hidden_states


class OPTDecoder(nn.Module):

    def __init__(self, config: OPTConfig):
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        self.vocab_size = config.vocab_size

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.word_embed_proj_dim,
        )
        # Positional embeddings are replicated (not sharded).
        self.embed_positions = OPTLearnedPositionalEmbedding(
            config.max_position_embeddings, config.hidden_size)

        # Project out & in will be replicated if they exist.
        if config.word_embed_proj_dim != config.hidden_size:
            self.project_out = nn.Linear(config.hidden_size,
                                         config.word_embed_proj_dim,
                                         bias=False)
        else:
            self.project_out = None

        if config.word_embed_proj_dim != config.hidden_size:
            self.project_in = nn.Linear(config.word_embed_proj_dim,
                                        config.hidden_size,
                                        bias=False)
        else:
            self.project_in = None

        # Note that the only purpose of `config._remove_final_layer_norm` is
        # to keep backward compatibility with checkpoints that have been
        # fine-tuned before transformers v4.20.1
        # see https://github.com/facebookresearch/metaseq/pull/164
        if config.do_layer_norm_before and not config._remove_final_layer_norm:
            self.final_layer_norm = nn.LayerNorm(
                config.hidden_size,
                elementwise_affine=config.layer_norm_elementwise_affine)
        else:
            self.final_layer_norm = None

        self.layers = nn.ModuleList(
            [OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        inputs_embeds = self.embed_tokens(input_ids)
        pos_embeds = self.embed_positions(positions)
        if self.project_in is not None:
            inputs_embeds = self.project_in(inputs_embeds)
        hidden_states = inputs_embeds + pos_embeds

        for i in range(len(self.layers)):
            if cache_events is None:
                cache_event = None
            else:
                cache_event = cache_events[i]
            layer = self.layers[i]
            hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
                                  cache_event)

        if self.final_layer_norm is not None:
            hidden_states = self.final_layer_norm(hidden_states)
        if self.project_out is not None:
            hidden_states = self.project_out(hidden_states)
        return hidden_states


class OPTModel(nn.Module):

    def __init__(self, config: OPTConfig):
        super().__init__()
        self.decoder = OPTDecoder(config)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        return self.decoder(input_ids, positions, kv_caches, input_metadata,
                            cache_events)


class OPTForCausalLM(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.model = OPTModel(config)
        # TODO(zhuohan): create a new weight after implementing pipeline
        # parallelism
        self.lm_head_weight = self.model.decoder.embed_tokens.weight
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> SamplerOutput:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata, cache_events)
        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
                                   input_metadata)
        return next_tokens

    _column_parallel_weights = [
        "embed_tokens.weight", "fc1.weight", "fc1.bias"
    ]
    _row_parallel_weights = ["out_proj.weight", "fc2.weight"]

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     load_format: str = "auto",
                     revision: Optional[str] = None):
        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
        state_dict = self.state_dict()

        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if "lm_head.weight" in name:
                continue

            if name.startswith("decoder."):
                name = "model." + name

            is_attention_weight = False
            for stride_id, att_weight_name in enumerate(
                    ["q_proj", "k_proj", "v_proj"]):
                if att_weight_name not in name:
                    continue
                param = state_dict[name.replace(att_weight_name, "qkv_proj")]
                shard_size = param.shape[0] // 3
                loaded_weight = loaded_weight[
                    shard_size * tensor_model_parallel_rank:shard_size *
                    (tensor_model_parallel_rank + 1)]
                param_slice = param.data[shard_size * stride_id:shard_size *
                                         (stride_id + 1)]
                assert param_slice.shape == loaded_weight.shape
                param_slice.copy_(loaded_weight)
                is_attention_weight = True
                break
            if is_attention_weight:
                continue

            param = state_dict[name]
            load_tensor_parallel_weights(param, loaded_weight, name,
                                         self._column_parallel_weights,
                                         self._row_parallel_weights,
                                         tensor_model_parallel_rank)
```
server/vllm/vllm/model_executor/models/qwen.py (deleted, 100644 → 0)

```python
# coding=utf-8
# Adapted from
# https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
# Copyright (c) Alibaba Cloud.
# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
"""Inference-only QWen model compatible with HuggingFace weights.

The input of the model is flattened to a 1D tensor of tokens. The model uses
InputMetadata to extract the original 2D shape of the input.
"""
from typing import Any, Dict, List, Optional, Tuple

import torch
from torch import nn

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (
    convert_pyslice_to_tensor,
    hf_model_weights_iterator,
    load_padded_tensor_parallel_vocab,
    load_tensor_parallel_weights,
)
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
)
from vllm.model_executor.parallel_utils.layers import (
    VocabParallelEmbedding,
    ColumnParallelLinear,
    RowParallelLinear,
)
from vllm.sequence import SamplerOutput
from vllm.transformers_utils.configs.qwen import QWenConfig

KVCache = Tuple[torch.Tensor, torch.Tensor]


class QWenMLP(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str = "silu",
    ):
        super().__init__()
        self.gate_up_proj = ColumnParallelLinear(
            hidden_size,
            2 * intermediate_size,
            bias=False,
            gather_output=False,
        )
        self.c_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=False,
            input_is_parallel=True,
        )
        if hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {hidden_act}. "
                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()

    def forward(self, x):
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.c_proj(x)
        return x


class QWenAttention(nn.Module):

    def __init__(self,
                 hidden_size: int,
                 num_heads: int,
                 max_position_embeddings: int,
                 rope_theta: float = 10000,
                 rope_scaling: Optional[Dict[str, Any]] = None):
        super().__init__()
        self.hidden_size = hidden_size
        tensor_model_parallel_world_size = (
            get_tensor_model_parallel_world_size())
        self.total_num_heads = num_heads
        assert self.total_num_heads % tensor_model_parallel_world_size == 0
        self.num_heads = (self.total_num_heads //
                          tensor_model_parallel_world_size)
        self.head_dim = hidden_size // self.total_num_heads

        # pylint: disable=invalid-name
        self.c_attn = ColumnParallelLinear(
            hidden_size,
            3 * hidden_size,
            bias=True,
            gather_output=False,
        )
        self.c_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=False,
            input_is_parallel=True,
        )
        self.scaling = self.head_dim**-0.5
        self.attn = PagedAttentionWithRoPE(
            self.num_heads,
            self.head_dim,
            self.scaling,
            rotary_dim=self.head_dim,
            base=rope_theta,
            max_position=max_position_embeddings,
            rope_scaling=rope_scaling)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        qkv, _ = self.c_attn(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        k_cache, v_cache = kv_cache
        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
                                input_metadata, cache_event)
        output, _ = self.c_proj(attn_output)
        return output


class QWenBlock(nn.Module):

    def __init__(self, config: QWenConfig):
        super().__init__()
        self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)

        rope_theta = getattr(config, "rope_theta", 10000)
        rope_scaling = getattr(config, "rope_scaling", None)
        self.attn = QWenAttention(config.hidden_size,
                                  config.num_attention_heads,
                                  config.max_position_embeddings,
                                  rope_theta=rope_theta,
                                  rope_scaling=rope_scaling)

        self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)

        self.mlp = QWenMLP(config.hidden_size, config.intermediate_size // 2)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        # Self Attention
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        hidden_states = self.attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
            cache_event=cache_event,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.ln_2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states


class QWenModel(nn.Module):

    def __init__(self, config: QWenConfig):
        super().__init__()
        self.config = config
        self.vocab_size = config.vocab_size

        vocab_size = ((config.vocab_size + 63) // 64) * 64
        self.wte = VocabParallelEmbedding(
            vocab_size,
            config.hidden_size,
        )
        self.h = nn.ModuleList(
            [QWenBlock(config) for _ in range(config.num_hidden_layers)])
        self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        hidden_states = self.wte(input_ids)
        for i in range(len(self.h)):
            if cache_events is None:
                cache_event = None
            else:
                cache_event = cache_events[i]
            layer = self.h[i]
            hidden_states = layer(
                positions,
                hidden_states,
                kv_caches[i],
                input_metadata,
                cache_event,
            )
        hidden_states = self.ln_f(hidden_states)
        return hidden_states


class QWenLMHeadModel(nn.Module):

    def __init__(self, config: QWenConfig):
        super().__init__()
        self.config = config
        self.transformer = QWenModel(config)
        vocab_size = ((config.vocab_size + 63) // 64) * 64
        self.lm_head = ColumnParallelLinear(
            config.hidden_size,
            vocab_size,
            bias=False,
            gather_output=False,
        )
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> SamplerOutput:
        hidden_states = self.transformer(input_ids, positions, kv_caches,
                                         input_metadata, cache_events)
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
                                   input_metadata)
        return next_tokens

    _column_parallel_weights = []
    _row_parallel_weights = ["c_proj.weight"]

    def load_weights(
        self,
        model_name_or_path: str,
        cache_dir: Optional[str] = None,
        load_format: str = "auto",
        revision: Optional[str] = None,
    ):
        tp_world_size = get_tensor_model_parallel_world_size()
        tp_rank = get_tensor_model_parallel_rank()
        state_dict = self.state_dict()

        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if "rotary_emb.inv_freq" in name:
                continue
            loaded_weight = convert_pyslice_to_tensor(loaded_weight)

            if "c_attn" in name:
                total_num_heads = self.config.num_attention_heads
                hidden_size = self.config.hidden_size
                head_size = hidden_size // total_num_heads
                num_heads = total_num_heads // tp_world_size
                head_start = tp_rank * num_heads
                head_end = (tp_rank + 1) * num_heads

                if "weight" in name:
                    loaded_weight = loaded_weight.view(3, total_num_heads,
                                                       head_size, hidden_size)
                    loaded_weight = loaded_weight[:, head_start:head_end, :, :]
                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
                elif "bias" in name:
                    loaded_weight = loaded_weight.view(3, total_num_heads,
                                                       head_size)
                    loaded_weight = loaded_weight[:, head_start:head_end, :]
                    loaded_weight = loaded_weight.reshape(-1)

            is_gate_up_weight = False
            for stride_id, weight_name in enumerate(["w2", "w1"]):
                if weight_name not in name:
                    continue
                param = state_dict[name.replace(weight_name, "gate_up_proj")]
                shard_size = param.shape[0] // 2
                loaded_weight = loaded_weight[shard_size *
                                              tp_rank:shard_size *
                                              (tp_rank + 1)]
                param_slice = param.data[shard_size * stride_id:shard_size *
                                         (stride_id + 1)]
                assert param_slice.shape == loaded_weight.shape
                param_slice.copy_(loaded_weight)
                is_gate_up_weight = True
                break
            if is_gate_up_weight:
                continue

            param = state_dict[name]

            if "wte" in name or "lm_head" in name:
                load_padded_tensor_parallel_vocab(param, loaded_weight,
                                                  tp_rank)
                continue

            load_tensor_parallel_weights(
                param,
                loaded_weight,
                name,
                self._column_parallel_weights,
                self._row_parallel_weights,
                tp_rank,
            )
```
server/vllm/vllm/model_executor/parallel_utils/README.md (deleted, 100644 → 0)

The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We only keep the code that is used in inference.
server/vllm/vllm/model_executor/parallel_utils/__init__.py (deleted, 100644 → 0, empty file)
server/vllm/vllm/model_executor/parallel_utils/communication_op.py (deleted, 100644 → 0)

```python
import torch

from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size,
    get_tensor_model_parallel_group,
)


def tensor_model_parallel_all_reduce(input_):
    """All-reduce the input tensor across model parallel group.

    NOTE: This operation is applied in-place on the input tensor.
    """
    # Bypass the function if we are using only 1 GPU.
    if get_tensor_model_parallel_world_size() == 1:
        return input_
    # All-reduce.
    torch.distributed.all_reduce(input_,
                                 group=get_tensor_model_parallel_group())
    return input_


def tensor_model_parallel_all_gather(input_, dim=-1):
    """All-gather the input tensor across model parallel group."""
    world_size = get_tensor_model_parallel_world_size()
    # Bypass the function if we are using only 1 GPU.
    if world_size == 1:
        return input_
    assert -input_.dim() <= dim < input_.dim(), (
        f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
    if dim < 0:
        # Convert negative dim to positive.
        dim += input_.dim()
    input_size = input_.size()
    # Allocate output tensor.
    output_tensor = torch.empty((world_size, ) + input_size,
                                dtype=input_.dtype,
                                device=input_.device)
    # All-gather.
    torch.distributed.all_gather_into_tensor(
        output_tensor, input_, group=get_tensor_model_parallel_group())
    # Reshape
    output_tensor = output_tensor.movedim(0, dim)
    output_tensor = output_tensor.reshape(input_size[:dim] +
                                          (world_size * input_size[dim], ) +
                                          input_size[dim + 1:])
    return output_tensor
```
server/vllm/vllm/model_executor/parallel_utils/layers.py (deleted, 100644 → 0)

```python
# Copyright 2023 The vLLM team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/layers.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch

from typing import Optional

import torch
import torch.nn.functional as F
from torch.nn.parameter import Parameter

from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
)
from vllm.model_executor.quantization_utils import QuantizationConfig
from vllm.model_executor.parallel_utils.communication_op import (
    tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather)
from vllm.model_executor.parallel_utils.utils import (
    divide,
    VocabUtility,
    split_tensor_along_last_dim,
)


class VocabParallelEmbedding(torch.nn.Module):
    """Embedding parallelized in the vocabulary dimension.

    This is mainly adapted from torch.nn.Embedding and all the default
    values are kept.

    Arguments:
        num_embeddings: vocabulary size.
        embedding_dim: size of hidden state.
        params_dtype: type of the parameters.
    """

    def __init__(self,
                 num_embeddings: int,
                 embedding_dim: int,
                 params_dtype: Optional[torch.dtype] = None):
        super().__init__()

        # Keep the input dimensions.
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()

        self.tp_size = get_tensor_model_parallel_world_size()
        # TODO: Handle vocab padding here.
        # Divide the weight matrix along the vocabulary dimension.
        self.vocab_start_index, self.vocab_end_index = (
            VocabUtility.vocab_range_from_global_vocab_size(
                self.num_embeddings, get_tensor_model_parallel_rank(),
                self.tp_size))
        self.num_embeddings_per_partition = (self.vocab_end_index -
                                             self.vocab_start_index)

        self.weight = Parameter(
            torch.empty(self.num_embeddings_per_partition,
                        self.embedding_dim,
                        device=torch.cuda.current_device(),
                        dtype=params_dtype))

    def forward(self, input_):
        if self.tp_size > 1:
            # Build the mask.
            input_mask = ((input_ < self.vocab_start_index) |
                          (input_ >= self.vocab_end_index))
            # Mask the input.
            masked_input = input_.clone() - self.vocab_start_index
            masked_input[input_mask] = 0
        else:
            masked_input = input_
        # Get the embeddings.
        output_parallel = F.embedding(masked_input, self.weight)
        # Mask the output embedding.
        if self.tp_size > 1:
            output_parallel[input_mask, :] = 0.0
        # Reduce across all the model parallel GPUs.
        output = tensor_model_parallel_all_reduce(output_parallel)
        return output


class ColumnParallelLinear(torch.nn.Module):
    """Linear layer with column parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its second dimension as A = [A_1, ..., A_p].

    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.

    Keyword Arguments
        bias: If true, add bias
        gather_output: If true, call all-gather on output and make Y available
                       to all GPUs, otherwise, every GPU will have its output
                       which is Y_i = XA_i
        skip_bias_add: This was added to enable performance optimizations where
                       bias can be fused with other element-wise operations. We
                       skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configuration.
    """

    def __init__(
        self,
        input_size: int,
        output_size: int,
        bias: bool = True,
        gather_output: bool = True,
        skip_bias_add: bool = False,
        params_dtype: Optional[torch.dtype] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__()

        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.gather_output = gather_output
        # Divide the weight matrix along the last dimension.
        self.tp_size = get_tensor_model_parallel_world_size()
        self.output_size_per_partition = divide(output_size, self.tp_size)
        self.skip_bias_add = skip_bias_add
        self.quant_config = quant_config

        if params_dtype is None:
            params_dtype = torch.get_default_dtype()

        # Parameters.
        # NOTE: torch.nn.functional.linear performs XA^T + b and as a result
        # we allocate the transpose.
        self.create_weights(params_dtype)

        if bias:
            self.bias = Parameter(
                torch.empty(self.output_size_per_partition,
                            device=torch.cuda.current_device(),
                            dtype=params_dtype))
        else:
            self.register_parameter('bias', None)

    def create_weights(self, dtype: torch.dtype) -> None:
        self.weight = Parameter(
            torch.empty(self.output_size_per_partition,
                        self.input_size,
                        device=torch.cuda.current_device(),
                        dtype=dtype))

    def apply_weights(
        self,
        x: torch.Tensor,
        bias: Optional[torch.Tensor],
    ) -> torch.Tensor:
        return F.linear(x, self.weight, bias)

    def forward(self, input_):
        """Forward of ColumnParallelLinear

        Args:
            input_: Tensor whose last dimension is `input_size`.

        Returns:
            - output
            - bias
        """
        bias = self.bias if not self.skip_bias_add else None

        input_parallel = input_
        # Matrix multiply.
        output_parallel = self.apply_weights(input_parallel, bias)
        if self.gather_output:
            # All-gather across the partitions.
            output = tensor_model_parallel_all_gather(output_parallel)
        else:
            output = output_parallel
        output_bias = self.bias if self.skip_bias_add else None
        return output, output_bias


class RowParallelLinear(torch.nn.Module):
    """Linear layer with row parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its first dimension and X along its second dimension as:
               -   -
              | A_1 |
              | .   |
          A = | .   |        X = [X_1, ..., X_p]
              | .   |
              | A_p |
               -   -

    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.

    Keyword Arguments:
        bias: If true, add bias. Note that bias is not parallelized.
        input_is_parallel: If true, we assume that the input is already
                           split across the GPUs and we do not split
                           again.
        skip_bias_add: This was added to enable performance optimization where
                       bias can be fused with other element-wise operations.
                       We skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configuration.
    """

    def __init__(
        self,
        input_size: int,
        output_size: int,
        bias: bool = True,
        input_is_parallel: bool = False,
        skip_bias_add: bool = False,
        params_dtype: Optional[torch.dtype] = None,
        reduce_results: bool = True,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__()

        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.input_is_parallel = input_is_parallel
        self.reduce_results = reduce_results
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()

        # Divide the weight matrix along the last dimension.
        self.tp_size = get_tensor_model_parallel_world_size()
        self.input_size_per_partition = divide(input_size, self.tp_size)
        self.skip_bias_add = skip_bias_add
        self.quant_config = quant_config

        self.create_weights(params_dtype)

        if not reduce_results and (bias and not skip_bias_add):
            raise ValueError('When reduce_results is disabled, adding bias to '
                             'the results can lead to incorrect results')

        if bias:
            self.bias = Parameter(
                torch.empty(self.output_size,
                            device=torch.cuda.current_device(),
                            dtype=params_dtype))

            # Always initialize bias to zero.
            with torch.no_grad():
                self.bias.zero_()
        else:
            self.register_parameter('bias', None)

    def create_weights(self, dtype: torch.dtype) -> None:
        self.weight = Parameter(
            torch.empty(self.output_size,
                        self.input_size_per_partition,
                        device=torch.cuda.current_device(),
                        dtype=dtype))

    def apply_weights(self, x: torch.Tensor) -> torch.Tensor:
        return F.linear(x, self.weight)

    def forward(self, input_):
        """Forward of RowParallelLinear

        Args:
            input_: tensor whose last dimension is `input_size`. If
                    `input_is_parallel` is set, then the last dimension
                    is `input_size // tp_size`.

        Returns:
            - output
            - bias
        """
        # Set up backprop all-reduce.
        if self.input_is_parallel:
            input_parallel = input_
        else:
            # TODO: simplify code below
            tp_rank = get_tensor_model_parallel_rank()
            splitted_input = split_tensor_along_last_dim(
                input_, num_partitions=self.tp_size)
            input_parallel = splitted_input[tp_rank].contiguous()

        # Matrix multiply.
        output_parallel = self.apply_weights(input_parallel)
        if self.reduce_results and self.tp_size > 1:
            output_ = tensor_model_parallel_all_reduce(output_parallel)
        else:
            output_ = output_parallel

        if not self.skip_bias_add:
            output = output_ + self.bias if self.bias is not None else output_
            output_bias = None
        else:
            output = output_
            output_bias = self.bias
        return output, output_bias
```
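A single-process sanity check of the partitioning convention used by `ColumnParallelLinear` and `RowParallelLinear` (weights stored as `(out, in)` for `F.linear`): concatenating the column-parallel outputs, or summing the row-parallel partial products, reproduces the unsharded result. The feature sizes and TP size below are assumptions chosen for the demonstration.

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
tp_size, in_features, out_features = 4, 16, 32  # assumed
x = torch.randn(2, in_features)
full_weight = torch.randn(out_features, in_features)  # (out, in), as F.linear expects
reference = F.linear(x, full_weight)

# Column parallelism: each rank holds an (out/tp, in) slice; concatenating
# the per-rank outputs along the last dim is what gather_output=True does.
col_shards = full_weight.chunk(tp_size, dim=0)
col_out = torch.cat([F.linear(x, w) for w in col_shards], dim=-1)
assert torch.allclose(col_out, reference, atol=1e-5)

# Row parallelism: each rank holds an (out, in/tp) slice and sees only its
# slice of the input; summing the partial products plays the all-reduce role.
row_shards = full_weight.chunk(tp_size, dim=1)
x_shards = x.chunk(tp_size, dim=-1)
row_out = sum(F.linear(xs, ws) for xs, ws in zip(x_shards, row_shards))
assert torch.allclose(row_out, reference, atol=1e-5)
```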
server/vllm/vllm/model_executor/parallel_utils/parallel_state.py
deleted
100644 → 0
View file @
64def8e2
# Copyright 2023 The vLLM team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Model and data parallel groups."""

import torch

# Tensor model parallel group that the current rank belongs to.
_TENSOR_MODEL_PARALLEL_GROUP = None
# Pipeline model parallel group that the current rank belongs to.
_PIPELINE_MODEL_PARALLEL_GROUP = None
# A list of global ranks for each pipeline group to ease calculation of the
# source rank when broadcasting from the first or last pipeline stage.
_PIPELINE_GLOBAL_RANKS = None


def initialize_model_parallel(
    tensor_model_parallel_size: int = 1,
    pipeline_model_parallel_size: int = 1,
) -> None:
    """
    Initialize model parallel groups.

    Arguments:
        tensor_model_parallel_size: number of GPUs used for tensor model
            parallelism.
        pipeline_model_parallel_size: number of GPUs used for pipeline model
            parallelism.

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
    the model pipeline. The present function will
    create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
        4 tensor model-parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 pipeline model-parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size: int = torch.distributed.get_world_size()

    if (world_size !=
            tensor_model_parallel_size * pipeline_model_parallel_size):
        raise RuntimeError(
            f"world_size ({world_size}) is not equal to "
            f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
            f"pipeline_model_parallel_size ({pipeline_model_parallel_size})")

    num_tensor_model_parallel_groups: int = (world_size //
                                             tensor_model_parallel_size)
    num_pipeline_model_parallel_groups: int = (world_size //
                                               pipeline_model_parallel_size)
    rank = torch.distributed.get_rank()

    # Build the tensor model-parallel groups.
    global _TENSOR_MODEL_PARALLEL_GROUP
    assert _TENSOR_MODEL_PARALLEL_GROUP is None, (
        "tensor model parallel group is already initialized")
    for i in range(num_tensor_model_parallel_groups):
        ranks = range(i * tensor_model_parallel_size,
                      (i + 1) * tensor_model_parallel_size)
        group = torch.distributed.new_group(ranks)
        if rank in ranks:
            _TENSOR_MODEL_PARALLEL_GROUP = group

    # Build the pipeline model-parallel groups.
    global _PIPELINE_MODEL_PARALLEL_GROUP
    global _PIPELINE_GLOBAL_RANKS
    assert _PIPELINE_MODEL_PARALLEL_GROUP is None, (
        "pipeline model parallel group is already initialized")
    for i in range(num_pipeline_model_parallel_groups):
        ranks = range(i, world_size, num_pipeline_model_parallel_groups)
        group = torch.distributed.new_group(ranks)
        if rank in ranks:
            _PIPELINE_MODEL_PARALLEL_GROUP = group
            _PIPELINE_GLOBAL_RANKS = ranks


def model_parallel_is_initialized():
    """Check if model and data parallel groups are initialized."""
    return (_TENSOR_MODEL_PARALLEL_GROUP is not None
            and _PIPELINE_MODEL_PARALLEL_GROUP is not None)


def get_tensor_model_parallel_group():
    """Get the tensor model parallel group the caller rank belongs to."""
    assert _TENSOR_MODEL_PARALLEL_GROUP is not None, (
        "tensor model parallel group is not initialized")
    return _TENSOR_MODEL_PARALLEL_GROUP


def get_pipeline_model_parallel_group():
    """Get the pipeline model parallel group the caller rank belongs to."""
    assert _PIPELINE_MODEL_PARALLEL_GROUP is not None, (
        "pipeline model parallel group is not initialized")
    return _PIPELINE_MODEL_PARALLEL_GROUP


def get_tensor_model_parallel_world_size():
    """Return world size for the tensor model parallel group."""
    return torch.distributed.get_world_size(
        group=get_tensor_model_parallel_group())


def get_pipeline_model_parallel_world_size():
    """Return world size for the pipeline model parallel group."""
    return torch.distributed.get_world_size(
        group=get_pipeline_model_parallel_group())


def get_tensor_model_parallel_rank():
    """Return my rank for the tensor model parallel group."""
    return torch.distributed.get_rank(group=get_tensor_model_parallel_group())


def get_pipeline_model_parallel_rank():
    """Return my rank for the pipeline model parallel group."""
    return torch.distributed.get_rank(
        group=get_pipeline_model_parallel_group())


def get_tensor_model_parallel_src_rank():
    """Calculate the global rank corresponding to the first local rank
    in the tensor model parallel group."""
    global_rank = torch.distributed.get_rank()
    local_world_size = get_tensor_model_parallel_world_size()
    return (global_rank // local_world_size) * local_world_size


def get_pipeline_model_parallel_first_rank():
    """Return the global rank of the first process in the pipeline for the
    current tensor parallel group."""
    assert _PIPELINE_GLOBAL_RANKS is not None, (
        "Pipeline parallel group is not initialized")
    return _PIPELINE_GLOBAL_RANKS[0]


def get_pipeline_model_parallel_last_rank():
    """Return the global rank of the last process in the pipeline for the
    current tensor parallel group."""
    assert _PIPELINE_GLOBAL_RANKS is not None, (
        "Pipeline parallel group is not initialized")
    last_rank_local = get_pipeline_model_parallel_world_size() - 1
    return _PIPELINE_GLOBAL_RANKS[last_rank_local]


def get_pipeline_model_parallel_next_rank():
    """Return the global rank that follows the caller in the pipeline."""
    assert _PIPELINE_GLOBAL_RANKS is not None, (
        "Pipeline parallel group is not initialized")
    rank_in_pipeline = get_pipeline_model_parallel_rank()
    world_size = get_pipeline_model_parallel_world_size()
    return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size]


def get_pipeline_model_parallel_prev_rank():
    """Return the global rank that precedes the caller in the pipeline."""
    assert _PIPELINE_GLOBAL_RANKS is not None, (
        "Pipeline parallel group is not initialized")
    rank_in_pipeline = get_pipeline_model_parallel_rank()
    world_size = get_pipeline_model_parallel_world_size()
    return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size]


def destroy_model_parallel():
    """Set the groups to None."""
    global _TENSOR_MODEL_PARALLEL_GROUP
    _TENSOR_MODEL_PARALLEL_GROUP = None
    global _PIPELINE_MODEL_PARALLEL_GROUP
    _PIPELINE_MODEL_PARALLEL_GROUP = None
    global _PIPELINE_GLOBAL_RANKS
    _PIPELINE_GLOBAL_RANKS = None
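A small standalone sketch (not part of the deleted file, no torch.distributed required) that reproduces the group layout described in the docstring above for 8 GPUs with tensor_model_parallel_size=2 and pipeline_model_parallel_size=4, using the same index arithmetic as the loops in initialize_model_parallel.

world_size, tp, pp = 8, 2, 4
tp_groups = [list(range(i * tp, (i + 1) * tp)) for i in range(world_size // tp)]
pp_groups = [list(range(i, world_size, world_size // pp))
             for i in range(world_size // pp)]
print(tp_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]
print(pp_groups)  # [[0, 2, 4, 6], [1, 3, 5, 7]]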
server/vllm/vllm/model_executor/parallel_utils/utils.py (deleted, 100644 → 0)
# Copyright 2023 The vLLM team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
from typing import List, Sequence

import torch


def ensure_divisibility(numerator, denominator):
    """Ensure that numerator is divisible by the denominator."""
    assert numerator % denominator == 0, "{} is not divisible by {}".format(
        numerator, denominator)


def divide(numerator, denominator):
    """Ensure that numerator is divisible by the denominator and return
    the division value."""
    ensure_divisibility(numerator, denominator)
    return numerator // denominator


def split_tensor_along_last_dim(
    tensor: torch.Tensor,
    num_partitions: int,
    contiguous_split_chunks: bool = False,
) -> List[torch.Tensor]:
    """Split a tensor along its last dimension.

    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor
        contiguous_split_chunks: If True, make each chunk contiguous
            in memory.

    Returns:
        A list of Tensors
    """
    # Get the size and dimension.
    last_dim = tensor.dim() - 1
    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
    # Split.
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
    # NOTE: torch.split does not create contiguous tensors by default.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in tensor_list)

    return tensor_list


class VocabUtility:
    """Split the vocabulary into `world_size` chunks and return the first
    and last index of the vocabulary belonging to the `rank`
    partition. Note that indices are in [first, last).
    """

    @staticmethod
    def vocab_range_from_per_partition_vocab_size(
            per_partition_vocab_size: int, rank: int) -> Sequence[int]:
        index_f = rank * per_partition_vocab_size
        index_l = index_f + per_partition_vocab_size
        return index_f, index_l

    @staticmethod
    def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int,
                                           world_size: int) -> Sequence[int]:
        per_partition_vocab_size = divide(global_vocab_size, world_size)
        return VocabUtility.vocab_range_from_per_partition_vocab_size(
            per_partition_vocab_size, rank)
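A quick check of the partitioning arithmetic above, written as a standalone sketch (the vocabulary size and world size are illustrative): splitting a 32000-token vocabulary across 4 ranks gives each rank a half-open [first, last) range of 8000 entries.

vocab_size, world_size = 32000, 4
per_partition = vocab_size // world_size
ranges = [(r * per_partition, (r + 1) * per_partition)
          for r in range(world_size)]
print(ranges)  # [(0, 8000), (8000, 16000), (16000, 24000), (24000, 32000)]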
server/vllm/vllm/model_executor/quantization_utils/__init__.py (deleted, 100644 → 0)
from typing import Type

from vllm.model_executor.quantization_utils.awq import AWQConfig
from vllm.model_executor.quantization_utils.base import QuantizationConfig

_QUANTIZATION_REGISTRY = {
    "awq": AWQConfig,
}


def get_quant_class(quantization: str) -> Type[QuantizationConfig]:
    if quantization not in _QUANTIZATION_REGISTRY:
        raise ValueError(f"Invalid quantization method: {quantization}")
    return _QUANTIZATION_REGISTRY[quantization]


__all__ = [
    "QuantizationConfig",
    "get_quant_class",
]
server/vllm/vllm/model_executor/quantization_utils/awq.py (deleted, 100644 → 0)
from typing import Any, Dict, List

import torch

from vllm.model_executor.quantization_utils.base import QuantizationConfig


class AWQConfig(QuantizationConfig):
    """Config class for AWQ.

    Reference: https://arxiv.org/abs/2306.00978
    """

    def __init__(
        self,
        weight_bits: int,
        group_size: int,
        zero_point: bool,
    ) -> None:
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.zero_point = zero_point

        if self.weight_bits != 4:
            raise ValueError(
                "Currently, only 4-bit weight quantization is supported for "
                f"AWQ, but got {self.weight_bits} bits.")
        self.pack_factor = 32 // self.weight_bits

    def __repr__(self) -> str:
        return (f"AWQConfig(weight_bits={self.weight_bits}, "
                f"group_size={self.group_size}, "
                f"zero_point={self.zero_point})")

    @classmethod
    def get_name(cls) -> str:
        return "awq"

    @classmethod
    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
        return [torch.half]

    @classmethod
    def get_min_capability(cls) -> int:
        # The AWQ kernel only supports Turing or newer GPUs.
        return 75

    @classmethod
    def get_config_filenames(cls) -> List[str]:
        return [
            "quant_config.json",  # E.g., casperhansen/vicuna-7b-v1.5-awq
            "quantize_config.json",  # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq  # pylint: disable=line-too-long
        ]

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "AWQConfig":
        weight_bits = cls.get_from_keys(config, ["w_bit", "bits"])
        group_size = cls.get_from_keys(config, ["q_group_size", "group_size"])
        zero_point = cls.get_from_keys(config, ["zero_point"])
        return cls(weight_bits, group_size, zero_point)

    @classmethod
    def get_packed_tensor_names(cls) -> List[str]:
        return ["qweight", "qzeros"]

    @classmethod
    def get_transposed_tensor_names(cls) -> List[str]:
        return ["qweight", "qzeros", "scales"]

    @classmethod
    def get_tp_tensor_names(cls) -> List[str]:
        return ["qweight", "qzeros", "scales"]
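A usage sketch for the class above; the config dict is a hypothetical quantize_config.json payload, shown only to illustrate the key aliases that from_config() accepts ("w_bit"/"bits", "q_group_size"/"group_size") and the resulting pack factor.

cfg = AWQConfig.from_config({"bits": 4, "group_size": 128, "zero_point": True})
print(cfg)              # AWQConfig(weight_bits=4, group_size=128, zero_point=True)
print(cfg.pack_factor)  # 8 -> one int32 packs 8 int4 weights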
server/vllm/vllm/model_executor/quantization_utils/base.py (deleted, 100644 → 0)
from typing import Any, Dict, List

import torch


class QuantizationConfig:

    @classmethod
    def get_name(cls) -> str:
        """Name of the quantization method."""
        raise NotImplementedError

    @classmethod
    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
        """List of supported activation dtypes."""
        raise NotImplementedError

    @classmethod
    def get_min_capability(cls) -> int:
        """Minimum GPU capability to support the quantization method.

        E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
        This requirement is due to the custom CUDA kernels used by the
        quantization method.
        """
        raise NotImplementedError

    @classmethod
    def get_config_filenames(cls) -> List[str]:
        """List of filenames to search for in the model directory."""
        raise NotImplementedError

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig":
        """Create a config class from the model's quantization config."""
        raise NotImplementedError

    @staticmethod
    def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any:
        """Get a value from the model's quantization config."""
        for key in keys:
            if key in config:
                return config[key]
        raise ValueError(f"Cannot find any of {keys} in the model's "
                         "quantization config.")

    @classmethod
    def get_packed_tensor_names(cls) -> List[str]:
        raise NotImplementedError

    @classmethod
    def is_packed(cls, tensor_name: str) -> bool:
        """Returns True if a tensor is packed.

        A tensor is considered packed if each element in the tensor is a
        packed representation of multiple elements in the original tensor.
        For example, an INT32 element in the tensor may represent 8 INT4
        elements in the original tensor.
        """
        return any(tag in tensor_name
                   for tag in cls.get_packed_tensor_names())

    @classmethod
    def get_transposed_tensor_names(cls) -> List[str]:
        raise NotImplementedError

    @classmethod
    def is_transposed(cls, tensor_name: str) -> bool:
        """Returns True if a tensor is transposed relative to
        nn.Linear.weight.
        """
        return any(tag in tensor_name
                   for tag in cls.get_transposed_tensor_names())

    @classmethod
    def get_tp_tensor_names(cls) -> List[str]:
        raise NotImplementedError
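A minimal, hypothetical subclass (not from the diff) showing how the substring-based helpers above behave: any parameter whose name contains one of the packed-tensor tags is reported as packed.

class FakeInt4Config(QuantizationConfig):

    @classmethod
    def get_packed_tensor_names(cls):
        return ["qweight", "qzeros"]

print(FakeInt4Config.is_packed("model.layers.0.mlp.qweight"))  # True
print(FakeInt4Config.is_packed("model.layers.0.mlp.scales"))   # False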
server/vllm/vllm/model_executor/utils.py (deleted, 100644 → 0)
"""Utils for model executor."""
import
random
import
numpy
as
np
import
torch
def
set_random_seed
(
seed
:
int
)
->
None
:
random
.
seed
(
seed
)
np
.
random
.
seed
(
seed
)
torch
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed_all
(
seed
)
server/vllm/vllm/model_executor/weight_utils.py (deleted, 100644 → 0)
"""Utilities for downloading and initializing model weights."""
import
filelock
import
glob
import
json
import
os
from
collections
import
defaultdict
from
typing
import
Any
,
Iterator
,
List
,
Optional
,
Tuple
from
huggingface_hub
import
snapshot_download
from
safetensors.torch
import
load_file
,
save_file
,
safe_open
import
numpy
as
np
import
torch
from
tqdm.auto
import
tqdm
from
vllm.logger
import
init_logger
from
vllm.model_executor.quantization_utils
import
get_quant_class
from
vllm.model_executor.quantization_utils.base
import
QuantizationConfig
logger
=
init_logger
(
__name__
)
class
Disabledtqdm
(
tqdm
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
,
disable
=
True
)
def
get_lock
(
model_name_or_path
:
str
,
cache_dir
:
Optional
[
str
]
=
None
):
lock_dir
=
cache_dir
if
cache_dir
is
not
None
else
"/tmp"
lock_file_name
=
model_name_or_path
.
replace
(
"/"
,
"-"
)
+
".lock"
lock
=
filelock
.
FileLock
(
os
.
path
.
join
(
lock_dir
,
lock_file_name
))
return
lock
def
_shared_pointers
(
tensors
):
ptrs
=
defaultdict
(
list
)
for
k
,
v
in
tensors
.
items
():
ptrs
[
v
.
data_ptr
()].
append
(
k
)
failing
=
[]
for
_
,
names
in
ptrs
.
items
():
if
len
(
names
)
>
1
:
failing
.
append
(
names
)
return
failing
def
convert_bin_to_safetensor_file
(
pt_filename
:
str
,
sf_filename
:
str
,
)
->
None
:
loaded
=
torch
.
load
(
pt_filename
,
map_location
=
"cpu"
)
if
"state_dict"
in
loaded
:
loaded
=
loaded
[
"state_dict"
]
shared
=
_shared_pointers
(
loaded
)
for
shared_weights
in
shared
:
for
name
in
shared_weights
[
1
:]:
loaded
.
pop
(
name
)
# For tensors to be contiguous
loaded
=
{
k
:
v
.
contiguous
()
for
k
,
v
in
loaded
.
items
()}
dirname
=
os
.
path
.
dirname
(
sf_filename
)
os
.
makedirs
(
dirname
,
exist_ok
=
True
)
save_file
(
loaded
,
sf_filename
,
metadata
=
{
"format"
:
"pt"
})
# check file size
sf_size
=
os
.
stat
(
sf_filename
).
st_size
pt_size
=
os
.
stat
(
pt_filename
).
st_size
if
(
sf_size
-
pt_size
)
/
pt_size
>
0.01
:
raise
RuntimeError
(
f
"""The file size different is more than 1%:
-
{
sf_filename
}
:
{
sf_size
}
-
{
pt_filename
}
:
{
pt_size
}
"""
)
# check if the tensors are the same
reloaded
=
load_file
(
sf_filename
)
for
k
in
loaded
:
pt_tensor
=
loaded
[
k
]
sf_tensor
=
reloaded
[
k
]
if
not
torch
.
equal
(
pt_tensor
,
sf_tensor
):
raise
RuntimeError
(
f
"The output tensors do not match for key
{
k
}
"
)
# TODO(woosuk): Move this to other place.
def
get_quant_config
(
quantization
:
str
,
model_name_or_path
:
str
,
cache_dir
:
Optional
[
str
]
=
None
,
)
->
QuantizationConfig
:
is_local
=
os
.
path
.
isdir
(
model_name_or_path
)
if
not
is_local
:
# Download the config files.
with
get_lock
(
model_name_or_path
,
cache_dir
):
hf_folder
=
snapshot_download
(
model_name_or_path
,
allow_patterns
=
"*.json"
,
cache_dir
=
cache_dir
,
tqdm_class
=
Disabledtqdm
)
else
:
hf_folder
=
model_name_or_path
config_files
=
glob
.
glob
(
os
.
path
.
join
(
hf_folder
,
"*.json"
))
quant_cls
=
get_quant_class
(
quantization
)
quant_config_files
=
[
f
for
f
in
config_files
if
any
(
f
.
endswith
(
x
)
for
x
in
quant_cls
.
get_config_filenames
())
]
if
len
(
quant_config_files
)
==
0
:
raise
ValueError
(
f
"Cannot find the config file for
{
quantization
}
"
)
if
len
(
quant_config_files
)
>
1
:
raise
ValueError
(
f
"Found multiple config files for
{
quantization
}
: "
f
"
{
quant_config_files
}
"
)
quant_config_file
=
quant_config_files
[
0
]
with
open
(
quant_config_file
,
"r"
)
as
f
:
config
=
json
.
load
(
f
)
return
quant_cls
.
from_config
(
config
)
def
prepare_hf_model_weights
(
model_name_or_path
:
str
,
cache_dir
:
Optional
[
str
]
=
None
,
use_safetensors
:
bool
=
False
,
fall_back_to_pt
:
bool
=
True
,
revision
:
Optional
[
str
]
=
None
,
)
->
Tuple
[
str
,
List
[
str
],
bool
]:
# Download model weights from huggingface.
is_local
=
os
.
path
.
isdir
(
model_name_or_path
)
if
use_safetensors
:
allow_patterns
=
[
"*.safetensors"
]
else
:
# Some quantized models use .pt files for storing the weights.
allow_patterns
=
[
"*.bin"
,
"*.pt"
]
if
not
is_local
:
# Use file lock to prevent multiple processes from
# downloading the same model weights at the same time.
with
get_lock
(
model_name_or_path
,
cache_dir
):
hf_folder
=
snapshot_download
(
model_name_or_path
,
allow_patterns
=
allow_patterns
,
cache_dir
=
cache_dir
,
tqdm_class
=
Disabledtqdm
,
revision
=
revision
)
else
:
hf_folder
=
model_name_or_path
hf_weights_files
:
List
[
str
]
=
[]
for
pattern
in
allow_patterns
:
hf_weights_files
+=
glob
.
glob
(
os
.
path
.
join
(
hf_folder
,
pattern
))
if
not
use_safetensors
:
# Exclude files that are not needed for inference.
# https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
blacklist
=
[
"training_args.bin"
,
"optimizer.bin"
,
"optimizer.pt"
,
"scheduler.pt"
,
"scaler.pt"
,
]
hf_weights_files
=
[
f
for
f
in
hf_weights_files
if
not
any
(
f
.
endswith
(
x
)
for
x
in
blacklist
)
]
if
len
(
hf_weights_files
)
==
0
and
use_safetensors
and
fall_back_to_pt
:
return
prepare_hf_model_weights
(
model_name_or_path
,
cache_dir
=
cache_dir
,
use_safetensors
=
False
,
fall_back_to_pt
=
False
,
revision
=
revision
)
if
len
(
hf_weights_files
)
==
0
:
raise
RuntimeError
(
f
"Cannot find any model weights with `
{
model_name_or_path
}
`"
)
return
hf_folder
,
hf_weights_files
,
use_safetensors
def
hf_model_weights_iterator
(
model_name_or_path
:
str
,
cache_dir
:
Optional
[
str
]
=
None
,
load_format
:
str
=
"auto"
,
revision
:
Optional
[
str
]
=
None
,
)
->
Iterator
[
Tuple
[
str
,
torch
.
Tensor
]]:
use_safetensors
=
False
use_np_cache
=
False
fall_back_to_pt
=
False
if
load_format
==
"auto"
:
use_safetensors
=
True
fall_back_to_pt
=
True
elif
load_format
==
"safetensors"
:
use_safetensors
=
True
elif
load_format
==
"pt"
:
pass
elif
load_format
==
"npcache"
:
use_np_cache
=
True
else
:
raise
ValueError
(
f
"Unknown load_format:
{
load_format
}
"
)
hf_folder
,
hf_weights_files
,
use_safetensors
=
prepare_hf_model_weights
(
model_name_or_path
,
cache_dir
=
cache_dir
,
use_safetensors
=
use_safetensors
,
fall_back_to_pt
=
fall_back_to_pt
,
revision
=
revision
)
if
use_np_cache
:
# Currently np_cache only support *.bin checkpoints
assert
use_safetensors
is
False
# Convert the model weights from torch tensors to numpy arrays for
# faster loading.
np_folder
=
os
.
path
.
join
(
hf_folder
,
"np"
)
os
.
makedirs
(
np_folder
,
exist_ok
=
True
)
weight_names_file
=
os
.
path
.
join
(
np_folder
,
"weight_names.json"
)
# Use file lock to prevent multiple processes from
# dumping the same model weights to numpy at the same time.
with
get_lock
(
model_name_or_path
,
cache_dir
):
if
not
os
.
path
.
exists
(
weight_names_file
):
weight_names
=
[]
for
bin_file
in
hf_weights_files
:
state
=
torch
.
load
(
bin_file
,
map_location
=
"cpu"
)
for
name
,
param
in
state
.
items
():
param_path
=
os
.
path
.
join
(
np_folder
,
name
)
with
open
(
param_path
,
"wb"
)
as
f
:
np
.
save
(
f
,
param
.
cpu
().
detach
().
numpy
())
weight_names
.
append
(
name
)
with
open
(
weight_names_file
,
"w"
)
as
f
:
json
.
dump
(
weight_names
,
f
)
with
open
(
weight_names_file
,
"r"
)
as
f
:
weight_names
=
json
.
load
(
f
)
for
name
in
weight_names
:
param_path
=
os
.
path
.
join
(
np_folder
,
name
)
with
open
(
param_path
,
"rb"
)
as
f
:
param
=
np
.
load
(
f
)
yield
name
,
torch
.
from_numpy
(
param
)
elif
use_safetensors
:
for
st_file
in
hf_weights_files
:
with
safe_open
(
st_file
,
framework
=
"pt"
)
as
f
:
for
name
in
f
.
keys
():
param
=
f
.
get_slice
(
name
)
yield
name
,
param
else
:
for
bin_file
in
hf_weights_files
:
state
=
torch
.
load
(
bin_file
,
map_location
=
"cpu"
)
for
name
,
param
in
state
.
items
():
yield
name
,
param
del
state
torch
.
cuda
.
empty_cache
()
def
convert_pyslice_to_tensor
(
x
:
Any
)
->
torch
.
Tensor
:
"""convert PySafeSlice object from safetensors to torch.Tensor
PySafeSlice object supports indexing, which is done before loading the
actual tensor and can reduce the amount of memory being read into the
memory. However, it does not support more advanced functionalities
like `.view()` or `.t()`. Therefore, if we need to modify the loaded
tensor with these more complicated operators, we need to convert to
tensor first.
"""
if
not
isinstance
(
x
,
torch
.
Tensor
):
x
=
x
[:]
return
x
def
load_padded_tensor_parallel_vocab
(
param
:
torch
.
Tensor
,
loaded_weight
:
Any
,
# `torch.Tensor` or `PySafeSlice`
tensor_model_parallel_rank
:
int
,
)
->
None
:
shard_size
=
param
.
shape
[
0
]
start_idx
=
tensor_model_parallel_rank
*
shard_size
end_idx
=
(
tensor_model_parallel_rank
+
1
)
*
shard_size
loaded_weight
=
loaded_weight
[
start_idx
:
end_idx
]
loaded_weight
=
convert_pyslice_to_tensor
(
loaded_weight
)
param
[:
loaded_weight
.
shape
[
0
]].
copy_
(
loaded_weight
)
def
load_tensor_parallel_weights
(
param
:
torch
.
Tensor
,
loaded_weight
:
Any
,
# `torch.Tensor` or `PySafeSlice`
param_name
:
str
,
column_parallel_weight_names
:
List
[
str
],
row_parallel_weight_names
:
List
[
str
],
tensor_model_parallel_rank
:
int
,
)
->
None
:
for
p
in
column_parallel_weight_names
:
if
p
in
param_name
:
shard_size
=
param
.
shape
[
0
]
start_idx
=
tensor_model_parallel_rank
*
shard_size
end_idx
=
(
tensor_model_parallel_rank
+
1
)
*
shard_size
loaded_weight
=
loaded_weight
[
start_idx
:
end_idx
]
break
for
p
in
row_parallel_weight_names
:
if
p
in
param_name
:
shard_size
=
param
.
shape
[
1
]
start_idx
=
tensor_model_parallel_rank
*
shard_size
end_idx
=
(
tensor_model_parallel_rank
+
1
)
*
shard_size
loaded_weight
=
loaded_weight
[:,
start_idx
:
end_idx
]
break
loaded_weight
=
convert_pyslice_to_tensor
(
loaded_weight
)
assert
param
.
shape
==
loaded_weight
.
shape
,
(
f
"
{
param_name
}
shape mismatch between model and checkpoint: "
f
"
{
param
.
shape
}
!=
{
loaded_weight
.
shape
}
"
)
param
.
data
.
copy_
(
loaded_weight
)
def
initialize_dummy_weights
(
model
:
torch
.
nn
.
Module
,
low
:
float
=
-
1e-3
,
high
:
float
=
1e-3
,
)
->
None
:
"""Initialize model weights with random values.
The model weights must be randomly initialized for accurate performance
measurements. Additionally, the model weights should not cause NaNs in the
forward pass. We empirically found that initializing the weights with
values between -1e-3 and 1e-3 works well for most models.
"""
for
param
in
model
.
state_dict
().
values
():
param
.
data
.
uniform_
(
low
,
high
)
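A standalone sketch (plain PyTorch, hypothetical shapes) of the slicing that load_tensor_parallel_weights() performs: column-parallel weights are sharded along dim 0 and row-parallel weights along dim 1, indexed by the calling tensor-parallel rank.

import torch

full_col_weight = torch.randn(1024, 512)   # e.g. a column-parallel projection
full_row_weight = torch.randn(512, 1024)   # e.g. a row-parallel projection
tp_rank, tp_size = 1, 4
shard = 1024 // tp_size

col_shard = full_col_weight[tp_rank * shard:(tp_rank + 1) * shard]      # dim 0
row_shard = full_row_weight[:, tp_rank * shard:(tp_rank + 1) * shard]   # dim 1
print(col_shard.shape)  # torch.Size([256, 512])
print(row_shard.shape)  # torch.Size([512, 256])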
server/vllm/vllm/outputs.py (deleted, 100644 → 0)
from typing import List, Optional

from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup,
                           SequenceStatus)


class CompletionOutput:
    """The output data of one completion output of a request.

    Args:
        index: The index of the output in the request.
        text: The generated output text.
        token_ids: The token IDs of the generated output text.
        cumulative_logprob: The cumulative log probability of the generated
            output text.
        logprobs: The log probabilities of the top probability words at each
            position if the logprobs are requested.
        finish_reason: The reason why the sequence is finished.
    """

    def __init__(
        self,
        index: int,
        text: str,
        token_ids: List[int],
        cumulative_logprob: float,
        logprobs: Optional[SampleLogprobs],
        finish_reason: Optional[str] = None,
    ) -> None:
        self.index = index
        self.text = text
        self.token_ids = token_ids
        self.cumulative_logprob = cumulative_logprob
        self.logprobs = logprobs
        self.finish_reason = finish_reason

    def finished(self) -> bool:
        return self.finish_reason is not None

    def __repr__(self) -> str:
        return (f"CompletionOutput(index={self.index}, "
                f"text={self.text!r}, "
                f"token_ids={self.token_ids}, "
                f"cumulative_logprob={self.cumulative_logprob}, "
                f"logprobs={self.logprobs}, "
                f"finish_reason={self.finish_reason})")


class RequestOutput:
    """The output data of a request to the LLM.

    Args:
        request_id: The unique ID of the request.
        prompt: The prompt string of the request.
        prompt_token_ids: The token IDs of the prompt.
        outputs: The output sequences of the request.
        finished: Whether the whole request is finished.
    """

    def __init__(
        self,
        request_id: str,
        prompt: str,
        prompt_token_ids: List[int],
        prompt_logprobs: Optional[PromptLogprobs],
        outputs: List[CompletionOutput],
        finished: bool,
    ) -> None:
        self.request_id = request_id
        self.prompt = prompt
        self.prompt_token_ids = prompt_token_ids
        self.prompt_logprobs = prompt_logprobs
        self.outputs = outputs
        self.finished = finished

    @classmethod
    def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput":
        # Get the top-n sequences.
        n = seq_group.sampling_params.n
        seqs = seq_group.get_seqs()
        if seq_group.sampling_params.use_beam_search:
            sorting_key = lambda seq: seq.get_beam_search_score(
                seq_group.sampling_params.length_penalty)
        else:
            sorting_key = lambda seq: seq.get_cumulative_logprob()
        sorted_seqs = sorted(seqs, key=sorting_key, reverse=True)
        top_n_seqs = sorted_seqs[:n]

        # Create the outputs.
        outputs: List[CompletionOutput] = []
        for seq in top_n_seqs:
            logprobs = seq.output_logprobs
            if seq_group.sampling_params.logprobs is None:
                # NOTE: We need to take care of this case because the sequence
                # always has the logprobs of the sampled tokens even if the
                # logprobs are not requested.
                logprobs = None
            finished_reason = SequenceStatus.get_finished_reason(seq.status)
            output = CompletionOutput(seqs.index(seq), seq.output_text,
                                      seq.get_output_token_ids(),
                                      seq.get_cumulative_logprob(), logprobs,
                                      finished_reason)
            outputs.append(output)

        # Every sequence in the sequence group should have the same prompt.
        prompt = seq_group.prompt
        prompt_token_ids = seq_group.prompt_token_ids
        prompt_logprobs = seq_group.prompt_logprobs
        finished = seq_group.is_finished()
        return cls(seq_group.request_id, prompt, prompt_token_ids,
                   prompt_logprobs, outputs, finished)

    def __repr__(self) -> str:
        return (f"RequestOutput(request_id={self.request_id}, "
                f"prompt={self.prompt!r}, "
                f"prompt_token_ids={self.prompt_token_ids}, "
                f"prompt_logprobs={self.prompt_logprobs}, "
                f"outputs={self.outputs}, "
                f"finished={self.finished})")
server/vllm/vllm/sampling_params.py (deleted, 100644 → 0)
"""Sampling parameters for text generation."""
from
enum
import
IntEnum
from
functools
import
cached_property
from
typing
import
List
,
Optional
,
Union
_SAMPLING_EPS
=
1e-5
class
SamplingType
(
IntEnum
):
GREEDY
=
0
RANDOM
=
1
BEAM
=
2
class
SamplingParams
:
"""Sampling parameters for text generation.
Overall, we follow the sampling parameters from the OpenAI text completion
API (https://platform.openai.com/docs/api-reference/completions/create).
In addition, we support beam search, which is not supported by OpenAI.
Args:
n: Number of output sequences to return for the given prompt.
best_of: Number of output sequences that are generated from the prompt.
From these `best_of` sequences, the top `n` sequences are returned.
`best_of` must be greater than or equal to `n`. This is treated as
the beam width when `use_beam_search` is True. By default, `best_of`
is set to `n`.
presence_penalty: Float that penalizes new tokens based on whether they
appear in the generated text so far. Values > 0 encourage the model
to use new tokens, while values < 0 encourage the model to repeat
tokens.
frequency_penalty: Float that penalizes new tokens based on their
frequency in the generated text so far. Values > 0 encourage the
model to use new tokens, while values < 0 encourage the model to
repeat tokens.
temperature: Float that controls the randomness of the sampling. Lower
values make the model more deterministic, while higher values make
the model more random. Zero means greedy sampling.
top_p: Float that controls the cumulative probability of the top tokens
to consider. Must be in (0, 1]. Set to 1 to consider all tokens.
top_k: Integer that controls the number of top tokens to consider. Set
to -1 to consider all tokens.
use_beam_search: Whether to use beam search instead of sampling.
length_penalty: Float that penalizes sequences based on their length.
Used in beam search.
early_stopping: Controls the stopping condition for beam search. It
accepts the following values: `True`, where the generation stops as
soon as there are `best_of` complete candidates; `False`, where an
heuristic is applied and the generation stops when is it very
unlikely to find better candidates; `"never"`, where the beam search
procedure only stops when there cannot be better candidates
(canonical beam search algorithm).
stop: List of strings that stop the generation when they are generated.
The returned output will not contain the stop strings.
stop_token_ids: List of tokens that stop the generation when they are
generated. The returned output will contain the stop tokens unless
the stop tokens are sepcial tokens.
ignore_eos: Whether to ignore the EOS token and continue generating
tokens after the EOS token is generated.
max_tokens: Maximum number of tokens to generate per output sequence.
logprobs: Number of log probabilities to return per output token.
Note that the implementation follows the OpenAI API: The return
result includes the log probabilities on the `logprobs` most likely
tokens, as well the chosen tokens. The API will always return the
log probability of the sampled token, so there may be up to
`logprobs+1` elements in the response.
prompt_logprobs: Number of log probabilities to return per prompt token.
skip_special_tokens: Whether to skip special tokens in the output.
"""
def
__init__
(
self
,
n
:
int
=
1
,
best_of
:
Optional
[
int
]
=
None
,
presence_penalty
:
float
=
0.0
,
frequency_penalty
:
float
=
0.0
,
temperature
:
float
=
1.0
,
top_p
:
float
=
1.0
,
top_k
:
int
=
-
1
,
use_beam_search
:
bool
=
False
,
length_penalty
:
float
=
1.0
,
early_stopping
:
Union
[
bool
,
str
]
=
False
,
stop
:
Optional
[
Union
[
str
,
List
[
str
]]]
=
None
,
stop_token_ids
:
Optional
[
List
[
int
]]
=
None
,
ignore_eos
:
bool
=
False
,
max_tokens
:
int
=
16
,
logprobs
:
Optional
[
int
]
=
None
,
prompt_logprobs
:
Optional
[
int
]
=
None
,
skip_special_tokens
:
bool
=
True
,
)
->
None
:
self
.
n
=
n
self
.
best_of
=
best_of
if
best_of
is
not
None
else
n
self
.
presence_penalty
=
presence_penalty
self
.
frequency_penalty
=
frequency_penalty
self
.
temperature
=
temperature
self
.
top_p
=
top_p
self
.
top_k
=
top_k
self
.
use_beam_search
=
use_beam_search
self
.
length_penalty
=
length_penalty
self
.
early_stopping
=
early_stopping
if
stop
is
None
:
self
.
stop
=
[]
elif
isinstance
(
stop
,
str
):
self
.
stop
=
[
stop
]
else
:
self
.
stop
=
list
(
stop
)
if
stop_token_ids
is
None
:
self
.
stop_token_ids
=
[]
else
:
self
.
stop_token_ids
=
list
(
stop_token_ids
)
self
.
ignore_eos
=
ignore_eos
self
.
max_tokens
=
max_tokens
self
.
logprobs
=
logprobs
self
.
prompt_logprobs
=
prompt_logprobs
self
.
skip_special_tokens
=
skip_special_tokens
self
.
_verify_args
()
if
self
.
use_beam_search
:
self
.
_verify_beam_search
()
else
:
self
.
_verify_non_beam_search
()
if
self
.
temperature
<
_SAMPLING_EPS
:
# Zero temperature means greedy sampling.
self
.
_verify_greedy_sampling
()
def
_verify_args
(
self
)
->
None
:
if
self
.
n
<
1
:
raise
ValueError
(
f
"n must be at least 1, got
{
self
.
n
}
."
)
if
self
.
best_of
<
self
.
n
:
raise
ValueError
(
f
"best_of must be greater than or equal to n, "
f
"got n=
{
self
.
n
}
and best_of=
{
self
.
best_of
}
."
)
if
not
-
2.0
<=
self
.
presence_penalty
<=
2.0
:
raise
ValueError
(
"presence_penalty must be in [-2, 2], got "
f
"
{
self
.
presence_penalty
}
."
)
if
not
-
2.0
<=
self
.
frequency_penalty
<=
2.0
:
raise
ValueError
(
"frequency_penalty must be in [-2, 2], got "
f
"
{
self
.
frequency_penalty
}
."
)
if
self
.
temperature
<
0.0
:
raise
ValueError
(
f
"temperature must be non-negative, got
{
self
.
temperature
}
."
)
if
not
0.0
<
self
.
top_p
<=
1.0
:
raise
ValueError
(
f
"top_p must be in (0, 1], got
{
self
.
top_p
}
."
)
if
self
.
top_k
<
-
1
or
self
.
top_k
==
0
:
raise
ValueError
(
f
"top_k must be -1 (disable), or at least 1, "
f
"got
{
self
.
top_k
}
."
)
if
self
.
max_tokens
<
1
:
raise
ValueError
(
f
"max_tokens must be at least 1, got
{
self
.
max_tokens
}
."
)
if
self
.
logprobs
is
not
None
and
self
.
logprobs
<
0
:
raise
ValueError
(
f
"logprobs must be non-negative, got
{
self
.
logprobs
}
."
)
if
self
.
prompt_logprobs
is
not
None
and
self
.
prompt_logprobs
<
0
:
raise
ValueError
(
f
"prompt_logprobs must be non-negative, got "
f
"
{
self
.
prompt_logprobs
}
."
)
def
_verify_beam_search
(
self
)
->
None
:
if
self
.
best_of
==
1
:
raise
ValueError
(
"best_of must be greater than 1 when using beam "
f
"search. Got
{
self
.
best_of
}
."
)
if
self
.
temperature
>
_SAMPLING_EPS
:
raise
ValueError
(
"temperature must be 0 when using beam search."
)
if
self
.
top_p
<
1.0
-
_SAMPLING_EPS
:
raise
ValueError
(
"top_p must be 1 when using beam search."
)
if
self
.
top_k
!=
-
1
:
raise
ValueError
(
"top_k must be -1 when using beam search."
)
if
self
.
early_stopping
not
in
[
True
,
False
,
"never"
]:
raise
ValueError
(
f
"early_stopping must be True, False, or 'never', "
f
"got
{
self
.
early_stopping
}
."
)
def
_verify_non_beam_search
(
self
)
->
None
:
if
self
.
early_stopping
is
not
False
:
raise
ValueError
(
"early_stopping is not effective and must be "
"False when not using beam search."
)
if
(
self
.
length_penalty
<
1.0
-
_SAMPLING_EPS
or
self
.
length_penalty
>
1.0
+
_SAMPLING_EPS
):
raise
ValueError
(
"length_penalty is not effective and must be the "
"default value of 1.0 when not using beam search."
)
def
_verify_greedy_sampling
(
self
)
->
None
:
if
self
.
best_of
>
1
:
raise
ValueError
(
"best_of must be 1 when using greedy sampling."
f
"Got
{
self
.
best_of
}
."
)
if
self
.
top_p
<
1.0
-
_SAMPLING_EPS
:
raise
ValueError
(
"top_p must be 1 when using greedy sampling."
)
if
self
.
top_k
!=
-
1
:
raise
ValueError
(
"top_k must be -1 when using greedy sampling."
)
@
cached_property
def
sampling_type
(
self
)
->
SamplingType
:
if
self
.
use_beam_search
:
return
SamplingType
.
BEAM
if
self
.
temperature
<
_SAMPLING_EPS
:
return
SamplingType
.
GREEDY
return
SamplingType
.
RANDOM
def
__repr__
(
self
)
->
str
:
return
(
f
"SamplingParams(n=
{
self
.
n
}
, "
f
"best_of=
{
self
.
best_of
}
, "
f
"presence_penalty=
{
self
.
presence_penalty
}
, "
f
"frequency_penalty=
{
self
.
frequency_penalty
}
, "
f
"temperature=
{
self
.
temperature
}
, "
f
"top_p=
{
self
.
top_p
}
, "
f
"top_k=
{
self
.
top_k
}
, "
f
"use_beam_search=
{
self
.
use_beam_search
}
, "
f
"length_penalty=
{
self
.
length_penalty
}
, "
f
"early_stopping=
{
self
.
early_stopping
}
, "
f
"stop=
{
self
.
stop
}
, "
f
"ignore_eos=
{
self
.
ignore_eos
}
, "
f
"max_tokens=
{
self
.
max_tokens
}
, "
f
"logprobs=
{
self
.
logprobs
}
, "
f
"prompt_logprobs=
{
self
.
prompt_logprobs
}
, "
f
"skip_special_tokens=
{
self
.
skip_special_tokens
}
)"
)
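Two usage sketches for the class above (assuming the class is importable or pasted into a session): greedy decoding keeps the default best_of=1, while beam search requires temperature=0, top_p=1, top_k=-1 and best_of greater than 1, as enforced by the verification helpers.

greedy = SamplingParams(temperature=0.0, max_tokens=64)
beam = SamplingParams(use_beam_search=True, best_of=4, temperature=0.0,
                      max_tokens=64)
print(greedy.sampling_type, beam.sampling_type)
# SamplingType.GREEDY SamplingType.BEAM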
server/vllm/vllm/sequence.py (deleted, 100644 → 0)
"""Sequence and its related classes."""
import
copy
import
enum
from
typing
import
Dict
,
List
,
Optional
,
Union
from
vllm.block
import
LogicalTokenBlock
from
vllm.sampling_params
import
SamplingParams
PromptLogprobs
=
List
[
Optional
[
Dict
[
int
,
float
]]]
SampleLogprobs
=
List
[
Dict
[
int
,
float
]]
class
SequenceStatus
(
enum
.
Enum
):
"""Status of a sequence."""
WAITING
=
enum
.
auto
()
RUNNING
=
enum
.
auto
()
SWAPPED
=
enum
.
auto
()
FINISHED_STOPPED
=
enum
.
auto
()
FINISHED_LENGTH_CAPPED
=
enum
.
auto
()
FINISHED_ABORTED
=
enum
.
auto
()
FINISHED_IGNORED
=
enum
.
auto
()
@
staticmethod
def
is_finished
(
status
:
"SequenceStatus"
)
->
bool
:
return
status
in
[
SequenceStatus
.
FINISHED_STOPPED
,
SequenceStatus
.
FINISHED_LENGTH_CAPPED
,
SequenceStatus
.
FINISHED_ABORTED
,
SequenceStatus
.
FINISHED_IGNORED
,
]
@
staticmethod
def
get_finished_reason
(
status
:
"SequenceStatus"
)
->
Union
[
str
,
None
]:
if
status
==
SequenceStatus
.
FINISHED_STOPPED
:
finish_reason
=
"stop"
elif
status
==
SequenceStatus
.
FINISHED_LENGTH_CAPPED
:
finish_reason
=
"length"
elif
status
==
SequenceStatus
.
FINISHED_ABORTED
:
finish_reason
=
"abort"
elif
status
==
SequenceStatus
.
FINISHED_IGNORED
:
# The ignored sequences are the sequences whose prompt lengths
# are longer than the model's length cap. Therefore, the stop
# reason should also be "length" as in OpenAI API.
finish_reason
=
"length"
else
:
finish_reason
=
None
return
finish_reason
class
SequenceData
:
"""Data associated with a sequence.
Args:
prompt_token_ids: The token IDs of the prompt.
Attributes:
prompt_token_ids: The token IDs of the prompt.
output_token_ids: The token IDs of the output.
cumulative_logprob: The cumulative log probability of the output.
"""
def
__init__
(
self
,
prompt_token_ids
:
List
[
int
],
)
->
None
:
self
.
prompt_token_ids
=
prompt_token_ids
self
.
output_token_ids
:
List
[
int
]
=
[]
self
.
cumulative_logprob
=
0.0
def
append_token_id
(
self
,
token_id
:
int
,
logprob
:
float
)
->
None
:
self
.
output_token_ids
.
append
(
token_id
)
self
.
cumulative_logprob
+=
logprob
def
get_len
(
self
)
->
int
:
return
len
(
self
.
output_token_ids
)
+
len
(
self
.
prompt_token_ids
)
def
get_prompt_len
(
self
)
->
int
:
return
len
(
self
.
prompt_token_ids
)
def
get_output_len
(
self
)
->
int
:
return
len
(
self
.
output_token_ids
)
def
get_token_ids
(
self
)
->
List
[
int
]:
return
self
.
prompt_token_ids
+
self
.
output_token_ids
def
get_last_token_id
(
self
)
->
int
:
if
not
self
.
output_token_ids
:
return
self
.
prompt_token_ids
[
-
1
]
return
self
.
output_token_ids
[
-
1
]
def
__repr__
(
self
)
->
str
:
return
(
f
"SequenceData("
f
"prompt_token_ids=
{
self
.
prompt_token_ids
}
, "
f
"output_token_ids=
{
self
.
output_token_ids
}
, "
f
"cumulative_logprob=
{
self
.
cumulative_logprob
}
)"
)
class
Sequence
:
"""Stores the data, status, and block information of a sequence.
Args:
seq_id: The ID of the sequence.
prompt: The prompt of the sequence.
prompt_token_ids: The token IDs of the prompt.
block_size: The block size of the sequence. Should be the same as the
block size used by the block manager and cache engine.
"""
def
__init__
(
self
,
seq_id
:
int
,
prompt
:
str
,
prompt_token_ids
:
List
[
int
],
block_size
:
int
,
)
->
None
:
self
.
seq_id
=
seq_id
self
.
prompt
=
prompt
self
.
block_size
=
block_size
self
.
data
=
SequenceData
(
prompt_token_ids
)
self
.
output_logprobs
:
SampleLogprobs
=
[]
self
.
output_text
=
""
self
.
logical_token_blocks
:
List
[
LogicalTokenBlock
]
=
[]
# Initialize the logical token blocks with the prompt token ids.
self
.
_append_tokens_to_blocks
(
prompt_token_ids
)
self
.
status
=
SequenceStatus
.
WAITING
# Used for incremental detokenization
self
.
prefix_offset
=
0
self
.
read_offset
=
0
# Input + output tokens
self
.
tokens
:
Optional
[
List
[
str
]]
=
None
def
_append_logical_block
(
self
)
->
None
:
block
=
LogicalTokenBlock
(
block_number
=
len
(
self
.
logical_token_blocks
),
block_size
=
self
.
block_size
,
)
self
.
logical_token_blocks
.
append
(
block
)
def
_append_tokens_to_blocks
(
self
,
token_ids
:
List
[
int
])
->
None
:
cursor
=
0
while
cursor
<
len
(
token_ids
):
if
not
self
.
logical_token_blocks
:
self
.
_append_logical_block
()
last_block
=
self
.
logical_token_blocks
[
-
1
]
if
last_block
.
is_full
():
self
.
_append_logical_block
()
last_block
=
self
.
logical_token_blocks
[
-
1
]
num_empty_slots
=
last_block
.
get_num_empty_slots
()
last_block
.
append_tokens
(
token_ids
[
cursor
:
cursor
+
num_empty_slots
])
cursor
+=
num_empty_slots
def
append_token_id
(
self
,
token_id
:
int
,
logprobs
:
Dict
[
int
,
float
],
)
->
None
:
assert
token_id
in
logprobs
self
.
_append_tokens_to_blocks
([
token_id
])
self
.
output_logprobs
.
append
(
logprobs
)
self
.
data
.
append_token_id
(
token_id
,
logprobs
[
token_id
])
def
get_len
(
self
)
->
int
:
return
self
.
data
.
get_len
()
def
get_prompt_len
(
self
)
->
int
:
return
self
.
data
.
get_prompt_len
()
def
get_output_len
(
self
)
->
int
:
return
self
.
data
.
get_output_len
()
def
get_token_ids
(
self
)
->
List
[
int
]:
return
self
.
data
.
get_token_ids
()
def
get_last_token_id
(
self
)
->
int
:
return
self
.
data
.
get_last_token_id
()
def
get_output_token_ids
(
self
)
->
List
[
int
]:
return
self
.
data
.
output_token_ids
def
get_cumulative_logprob
(
self
)
->
float
:
return
self
.
data
.
cumulative_logprob
def
get_beam_search_score
(
self
,
length_penalty
:
float
=
0.0
,
seq_len
:
Optional
[
int
]
=
None
,
eos_token_id
:
Optional
[
int
]
=
None
)
->
float
:
"""Calculate the beam search score with length penalty.
Adapted from
https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938
"""
if
seq_len
is
None
:
seq_len
=
self
.
get_len
()
# NOTE: HF implementation does not count the EOS token
# towards the length, we align with that here for testing.
if
(
eos_token_id
is
not
None
and
self
.
get_last_token_id
()
==
eos_token_id
):
seq_len
-=
1
return
self
.
get_cumulative_logprob
()
/
(
seq_len
**
length_penalty
)
def
is_finished
(
self
)
->
bool
:
return
SequenceStatus
.
is_finished
(
self
.
status
)
def
fork
(
self
,
new_seq_id
:
int
)
->
"Sequence"
:
new_seq
=
copy
.
deepcopy
(
self
)
new_seq
.
seq_id
=
new_seq_id
return
new_seq
def
__repr__
(
self
)
->
str
:
return
(
f
"Sequence(seq_id=
{
self
.
seq_id
}
, "
f
"status=
{
self
.
status
.
name
}
, "
f
"num_blocks=
{
len
(
self
.
logical_token_blocks
)
}
)"
)
class
SequenceGroup
:
"""A group of sequences that are generated from the same prompt.
Args:
request_id: The ID of the request.
seqs: The list of sequences.
sampling_params: The sampling parameters used to generate the outputs.
arrival_time: The arrival time of the request.
"""
def
__init__
(
self
,
request_id
:
str
,
seqs
:
List
[
Sequence
],
sampling_params
:
SamplingParams
,
arrival_time
:
float
,
)
->
None
:
self
.
request_id
=
request_id
self
.
seqs_dict
=
{
seq
.
seq_id
:
seq
for
seq
in
seqs
}
self
.
sampling_params
=
sampling_params
self
.
arrival_time
=
arrival_time
self
.
prompt_logprobs
:
Optional
[
PromptLogprobs
]
=
None
@
property
def
prompt
(
self
)
->
str
:
# All sequences in the group should have the same prompt.
# We use the prompt of an arbitrary sequence.
return
next
(
iter
(
self
.
seqs_dict
.
values
())).
prompt
@
property
def
prompt_token_ids
(
self
)
->
List
[
int
]:
# All sequences in the group should have the same prompt.
# We use the prompt of an arbitrary sequence.
return
next
(
iter
(
self
.
seqs_dict
.
values
())).
data
.
prompt_token_ids
def
get_max_num_running_seqs
(
self
)
->
int
:
"""The maximum number of sequences running in parallel in the remaining
lifetime of the request."""
if
self
.
sampling_params
.
use_beam_search
:
# For beam search, maximally there will always be `best_of` beam
# candidates running in the future.
return
self
.
sampling_params
.
best_of
else
:
if
self
.
sampling_params
.
best_of
>
self
.
num_seqs
():
# At prompt stage, the sequence group is not yet filled up
# and only have one sequence running. However, in the
# generation stage, we will have `best_of` sequences running.
return
self
.
sampling_params
.
best_of
# At sampling stages, return the number of actual sequences
# that are not finished yet.
return
self
.
num_unfinished_seqs
()
def
get_seqs
(
self
,
status
:
Optional
[
SequenceStatus
]
=
None
,
)
->
List
[
Sequence
]:
if
status
is
None
:
return
list
(
self
.
seqs_dict
.
values
())
else
:
return
[
seq
for
seq
in
self
.
seqs_dict
.
values
()
if
seq
.
status
==
status
]
def
get_unfinished_seqs
(
self
)
->
List
[
Sequence
]:
return
[
seq
for
seq
in
self
.
seqs_dict
.
values
()
if
not
seq
.
is_finished
()
]
def
get_finished_seqs
(
self
)
->
List
[
Sequence
]:
return
[
seq
for
seq
in
self
.
seqs_dict
.
values
()
if
seq
.
is_finished
()]
def
num_seqs
(
self
,
status
:
Optional
[
SequenceStatus
]
=
None
)
->
int
:
return
len
(
self
.
get_seqs
(
status
))
def
num_unfinished_seqs
(
self
)
->
int
:
return
len
(
self
.
get_unfinished_seqs
())
def
num_finished_seqs
(
self
)
->
int
:
return
len
(
self
.
get_finished_seqs
())
def
find
(
self
,
seq_id
:
int
)
->
Sequence
:
if
seq_id
not
in
self
.
seqs_dict
:
raise
ValueError
(
f
"Sequence
{
seq_id
}
not found."
)
return
self
.
seqs_dict
[
seq_id
]
def
add
(
self
,
seq
:
Sequence
)
->
None
:
if
seq
.
seq_id
in
self
.
seqs_dict
:
raise
ValueError
(
f
"Sequence
{
seq
.
seq_id
}
already exists."
)
self
.
seqs_dict
[
seq
.
seq_id
]
=
seq
def
remove
(
self
,
seq_id
:
int
)
->
None
:
if
seq_id
not
in
self
.
seqs_dict
:
raise
ValueError
(
f
"Sequence
{
seq_id
}
not found."
)
del
self
.
seqs_dict
[
seq_id
]
def
is_finished
(
self
)
->
bool
:
return
all
(
seq
.
is_finished
()
for
seq
in
self
.
get_seqs
())
def
__repr__
(
self
)
->
str
:
return
(
f
"SequenceGroup(request_id=
{
self
.
request_id
}
, "
f
"sampling_params=
{
self
.
sampling_params
}
, "
f
"num_seqs=
{
len
(
self
.
seqs_dict
)
}
)"
)
class
SequenceGroupMetadata
:
"""Metadata for a sequence group. Used to create `InputMetadata`.
Args:
request_id: The ID of the request.
is_prompt: Whether the request is at prompt stage.
seq_data: The sequence data. (Seq id -> sequence data)
sampling_params: The sampling parameters used to generate the outputs.
block_tables: The block tables. (Seq id -> list of physical block
numbers)
"""
def
__init__
(
self
,
request_id
:
str
,
is_prompt
:
bool
,
seq_data
:
Dict
[
int
,
SequenceData
],
sampling_params
:
SamplingParams
,
block_tables
:
Dict
[
int
,
List
[
int
]],
)
->
None
:
self
.
request_id
=
request_id
self
.
is_prompt
=
is_prompt
self
.
seq_data
=
seq_data
self
.
sampling_params
=
sampling_params
self
.
block_tables
=
block_tables
class
SequenceOutputs
:
"""The model output associated with a sequence.
Args:
parent_seq_id: The ID of the parent sequence (for forking in beam
search).
output_token: The output token ID.
logprobs: The logprobs of the output token.
(Token id -> logP(x_i+1 | x_0, ..., x_i))
"""
def
__init__
(
self
,
parent_seq_id
:
int
,
output_token
:
int
,
logprobs
:
Dict
[
int
,
float
],
)
->
None
:
self
.
parent_seq_id
=
parent_seq_id
self
.
output_token
=
output_token
self
.
logprobs
=
logprobs
def
__repr__
(
self
)
->
str
:
return
(
f
"SequenceOutputs(parent_seq_id=
{
self
.
parent_seq_id
}
, "
f
"output_token=
{
self
.
output_token
}
, "
f
"logprobs=
{
self
.
logprobs
}
)"
)
def
__eq__
(
self
,
other
:
object
)
->
bool
:
if
not
isinstance
(
other
,
SequenceOutputs
):
raise
NotImplementedError
()
return
(
self
.
parent_seq_id
==
other
.
parent_seq_id
and
self
.
output_token
==
other
.
output_token
and
self
.
logprobs
==
other
.
logprobs
)
class
SequenceGroupOutputs
:
"""The model outputs associated with a sequence group."""
def
__init__
(
self
,
samples
:
List
[
SequenceOutputs
],
prompt_logprobs
:
Optional
[
PromptLogprobs
],
)
->
None
:
self
.
samples
=
samples
self
.
prompt_logprobs
=
prompt_logprobs
def
__repr__
(
self
)
->
str
:
return
(
f
"SequenceGroupOutputs(samples=
{
self
.
samples
}
, "
f
"prompt_logprobs=
{
self
.
prompt_logprobs
}
)"
)
# For each sequence group, we generate a list of SequenceOutputs object,
# each of which contains one possible candidate for the next token.
SamplerOutput
=
List
[
SequenceGroupOutputs
]
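A small numeric sketch (not from the diff, illustrative values only) of the arithmetic in get_beam_search_score(): the cumulative logprob is divided by seq_len ** length_penalty, so a length_penalty of 1.0 turns the score into an average per-token logprob.

cumulative_logprob, seq_len, length_penalty = -6.0, 12, 1.0
score = cumulative_logprob / (seq_len ** length_penalty)
print(score)  # -0.5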
server/vllm/vllm/transformers_utils/__init__.py (deleted, 100644 → 0)
server/vllm/vllm/transformers_utils/config.py (deleted, 100644 → 0)
from typing import Optional

from transformers import AutoConfig, PretrainedConfig

from vllm.transformers_utils.configs import *  # pylint: disable=wildcard-import

_CONFIG_REGISTRY = {
    "mpt": MPTConfig,
    "baichuan": BaiChuanConfig,
    "aquila": AquilaConfig,
    "qwen": QWenConfig,
    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
}


def get_config(model: str,
               trust_remote_code: bool,
               revision: Optional[str] = None) -> PretrainedConfig:
    try:
        config = AutoConfig.from_pretrained(
            model, trust_remote_code=trust_remote_code, revision=revision)
    except ValueError as e:
        if (not trust_remote_code and
                "requires you to execute the configuration file" in str(e)):
            err_msg = (
                "Failed to load the model config. If the model is a custom "
                "model not yet available in the HuggingFace transformers "
                "library, consider setting `trust_remote_code=True` in LLM "
                "or using the `--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e
    if config.model_type in _CONFIG_REGISTRY:
        config_class = _CONFIG_REGISTRY[config.model_type]
        config = config_class.from_pretrained(model, revision=revision)
    return config
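A usage sketch, assuming this module is importable from the vendored vLLM tree above and that the referenced checkpoint is reachable locally or on the Hub; model types listed in the registry are re-loaded through their dedicated config classes.

from vllm.transformers_utils.config import get_config

config = get_config("mosaicml/mpt-7b", trust_remote_code=True)
print(type(config).__name__)  # expected to be MPTConfig via the registry above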