Repository: xdb4_94051 / vllm

Commit 7e1d5e53, authored Feb 19, 2024 by zhuwenwen

    merge v0.3.1

Parents: e3378b20, 5f08050d
Changes: 103
Showing 20 changed files with 406 additions and 918 deletions (+406 / −918)
vllm/model_executor/models/aquila.py                        +0    −342
vllm/model_executor/models/decilm.py                        +5    −1
vllm/model_executor/models/deepseek.py                      +3    −12
vllm/model_executor/models/internlm.py                      +0    −299
vllm/model_executor/models/internlm2.py                     +60   −65
vllm/model_executor/models/llama.py                         +36   −8
vllm/model_executor/models/mistral.py                       +26   −1
vllm/model_executor/models/mixtral.py                       +55   −21
vllm/model_executor/parallel_utils/communication_op.py      +9    −4
vllm/model_executor/parallel_utils/cupy_utils.py            +130  −0
vllm/model_executor/parallel_utils/custom_all_reduce.py     +4    −0
vllm/model_executor/parallel_utils/parallel_state.py        +37   −0
vllm/model_executor/weight_utils.py                         +14   −15
vllm/sequence.py                                            +8    −2
vllm/test_utils.py                                          +5    −2
vllm/transformers_utils/config.py                           +0    −2
vllm/transformers_utils/configs/__init__.py                 +0    −4
vllm/transformers_utils/configs/aquila.py                   +0    −69
vllm/transformers_utils/configs/yi.py                       +0    −64
vllm/utils.py                                               +14   −7
vllm/model_executor/models/aquila.py  (deleted, 100644 → 0)
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only LLaMA model compatible with HuggingFace weights."""
from typing import Any, Dict, List, Optional, Tuple

import torch
from torch import nn

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import PagedAttention
from vllm.model_executor.layers.linear import (LinearMethodBase,
                                               MergedColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
from vllm.transformers_utils.configs.aquila import AquilaConfig

KVCache = Tuple[torch.Tensor, torch.Tensor]


class AquilaMLP(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
        linear_method: Optional[LinearMethodBase] = None,
    ):
        super().__init__()
        self.gate_up_proj = MergedColumnParallelLinear(
            hidden_size, [intermediate_size] * 2,
            bias=False,
            linear_method=linear_method)
        self.down_proj = RowParallelLinear(intermediate_size,
                                           hidden_size,
                                           bias=False,
                                           linear_method=linear_method)
        if hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {hidden_act}. "
                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()

    def forward(self, x):
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x


class AquilaRMSNorm(nn.Module):

    def __init__(self, hidden_size, eps=1e-6):
        """
        AquilaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        variance = hidden_states.to(torch.float32).pow(2).mean(-1,
                                                               keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance +
                                                    self.variance_epsilon)

        return (self.weight * hidden_states).to(input_dtype)


class AquilaAttention(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        rope_theta: float = 10000,
        max_position_embeddings: int = 8192,
        rope_scaling: Optional[Dict[str, Any]] = None,
        linear_method: Optional[LinearMethodBase] = None,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = num_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.total_num_kv_heads = num_kv_heads
        assert self.total_num_kv_heads % tp_size == 0
        self.num_kv_heads = self.total_num_kv_heads // tp_size
        self.head_dim = hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings

        self.qkv_proj = QKVParallelLinear(
            hidden_size,
            self.head_dim,
            self.total_num_heads,
            self.total_num_kv_heads,
            bias=False,
            linear_method=linear_method,
        )
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=False,
            linear_method=linear_method,
        )
        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=self.max_position_embeddings,
            base=self.rope_theta,
            rope_scaling=rope_scaling,
        )
        self.attn = PagedAttention(self.num_heads,
                                   self.head_dim,
                                   self.scaling,
                                   num_kv_heads=self.num_kv_heads)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(positions, q, k)
        k_cache, v_cache = kv_cache
        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
        output, _ = self.o_proj(attn_output)
        return output


class AquilaDecoderLayer(nn.Module):

    def __init__(
        self,
        config: AquilaConfig,
        linear_method: Optional[LinearMethodBase] = None,
    ):
        super().__init__()
        self.hidden_size = config.hidden_size
        rope_theta = getattr(config, "rope_theta", 10000)
        rope_scaling = getattr(config, "rope_scaling", None)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
        self.self_attn = AquilaAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            num_kv_heads=config.num_key_value_heads,
            rope_theta=rope_theta,
            max_position_embeddings=max_position_embeddings,
            rope_scaling=rope_scaling,
            linear_method=linear_method,
        )
        self.mlp = AquilaMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
            linear_method=linear_method,
        )
        self.input_layernorm = AquilaRMSNorm(config.hidden_size,
                                             eps=config.rms_norm_eps)
        self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size,
                                                      eps=config.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
    ) -> torch.Tensor:
        # Self Attention
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states


class AquilaModel(nn.Module):

    def __init__(
        self,
        config: AquilaConfig,
        linear_method: Optional[LinearMethodBase] = None,
    ):
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
        )
        self.layers = nn.ModuleList([
            AquilaDecoderLayer(config, linear_method)
            for _ in range(config.num_hidden_layers)
        ])
        self.norm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)
        for i in range(len(self.layers)):
            layer = self.layers[i]
            hidden_states = layer(
                positions,
                hidden_states,
                kv_caches[i],
                input_metadata,
            )
        hidden_states = self.norm(hidden_states)
        return hidden_states


class AquilaForCausalLM(nn.Module):

    def __init__(
        self,
        config,
        linear_method: Optional[LinearMethodBase] = None,
    ):
        super().__init__()
        self.config = config
        self.linear_method = linear_method
        self.model = AquilaModel(config, linear_method)
        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
    ) -> torch.Tensor:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata)
        return hidden_states

    def sample(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     load_format: str = "auto",
                     revision: Optional[str] = None):
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if "rotary_emb.inv_freq" in name:
                continue
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
vllm/model_executor/models/decilm.py

@@ -28,6 +28,7 @@ from typing import Optional

 import torch
 from transformers import PretrainedConfig

+from vllm.config import LoRAConfig
 from vllm.model_executor.layers.linear import LinearMethodBase
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.model_executor.weight_utils import (default_weight_loader,
@@ -56,10 +57,13 @@ class DeciLMForCausalLM(LlamaForCausalLM):
         self,
         config: Optional[PretrainedConfig] = None,
         linear_method: Optional[LinearMethodBase] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         config.num_key_value_heads = max(config.num_key_value_heads_per_layer)
         delattr(config, "num_key_value_heads_per_layer")
-        super().__init__(config=config, linear_method=linear_method)
+        super().__init__(config=config,
+                         linear_method=linear_method,
+                         lora_config=lora_config)

     def load_weights(self,
                      model_name_or_path: str,
vllm/model_executor/models/deepseek.py

@@ -25,7 +25,6 @@ from typing import Any, Dict, List, Optional, Tuple

 import torch
 from torch import nn
-import torch.nn.functional as F
 from transformers import PretrainedConfig

 from vllm.model_executor.input_metadata import InputMetadata
@@ -155,20 +154,12 @@ class DeepseekMoE(nn.Module):
             shared_output = self.shared_experts(hidden_states)
         # router_logits: (batch * sequence_length, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-        routing_weights, selected_experts = torch.topk(routing_weights,
-                                                       self.top_k,
-                                                       dim=-1)
-        if self.config.norm_topk_prob:
-            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
-        final_hidden_states = fused_moe(hidden_states,
-                                        self.w1,
-                                        self.w2,
-                                        routing_weights,
-                                        selected_experts,
+        final_hidden_states = fused_moe(hidden_states,
+                                        self.w1,
+                                        self.w2,
+                                        router_logits,
+                                        self.top_k,
+                                        renormalize=self.config.norm_topk_prob,
                                         inplace=True)

         if self.config.n_shared_experts is not None:
vllm/model_executor/models/internlm.py  (deleted, 100644 → 0)

# -*- coding: utf-8 -*-
from typing import Any, Dict, List, Optional, Tuple

import torch
from torch import nn
from transformers import LlamaConfig

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import PagedAttention
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (LinearMethodBase,
                                               MergedColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
from vllm.sequence import SamplerOutput

KVCache = Tuple[torch.Tensor, torch.Tensor]


class InternLMMLP(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
        linear_method: Optional[LinearMethodBase] = None,
    ):
        super().__init__()
        self.gate_up_proj = MergedColumnParallelLinear(
            hidden_size, [intermediate_size] * 2,
            bias=False,
            linear_method=linear_method)
        self.down_proj = RowParallelLinear(intermediate_size,
                                           hidden_size,
                                           bias=False,
                                           linear_method=linear_method)
        if hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {hidden_act}. "
                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()

    def forward(self, x):
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x


class InternLMAttention(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        bias: bool,
        rope_theta: float = 10000,
        max_position_embeddings: int = 8192,
        linear_method: Optional[LinearMethodBase] = None,
        rope_scaling: Optional[Dict[str, Any]] = None,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        tensor_model_parallel_world_size = (
            get_tensor_model_parallel_world_size())
        self.total_num_heads = num_heads
        assert self.total_num_heads % tensor_model_parallel_world_size == 0
        self.num_heads = (self.total_num_heads //
                          tensor_model_parallel_world_size)
        self.head_dim = hidden_size // self.total_num_heads
        self.scaling = self.head_dim**-0.5
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings

        self.qkv_proj = QKVParallelLinear(
            hidden_size,
            self.head_dim,
            self.total_num_heads,
            bias=bias,
            linear_method=linear_method,
        )
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=bias,
            linear_method=linear_method,
        )
        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=self.max_position_embeddings,
            base=self.rope_theta,
            rope_scaling=rope_scaling,
        )
        self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        q, k = self.rotary_emb(positions, q, k)
        k_cache, v_cache = kv_cache
        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
        output, _ = self.o_proj(attn_output)
        return output


class InternLMDecoderLayer(nn.Module):

    def __init__(
        self,
        config: LlamaConfig,
        linear_method: Optional[LinearMethodBase] = None,
    ):
        super().__init__()
        self.hidden_size = config.hidden_size
        rope_theta = getattr(config, "rope_theta", 10000)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
        self.self_attn = InternLMAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            bias=config.bias,
            rope_theta=rope_theta,
            max_position_embeddings=max_position_embeddings,
            linear_method=linear_method,
            rope_scaling=getattr(config, "rope_scaling", None),
        )
        self.mlp = InternLMMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
            linear_method=linear_method,
        )
        self.input_layernorm = RMSNorm(config.hidden_size,
                                       eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                eps=config.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        residual: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Self Attention
        if residual is None:
            residual = hidden_states
            hidden_states = self.input_layernorm(hidden_states)
        else:
            hidden_states, residual = self.input_layernorm(
                hidden_states, residual)
        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
        )

        # Fully Connected
        hidden_states, residual = self.post_attention_layernorm(
            hidden_states, residual)
        hidden_states = self.mlp(hidden_states)
        return hidden_states, residual


class InternLMModel(nn.Module):

    def __init__(
        self,
        config: LlamaConfig,
        linear_method: Optional[LinearMethodBase] = None,
    ):
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        vocab_size = ((config.vocab_size + 63) // 64) * 64
        self.embed_tokens = VocabParallelEmbedding(
            vocab_size,
            config.hidden_size,
        )
        self.layers = nn.ModuleList([
            InternLMDecoderLayer(config, linear_method)
            for _ in range(config.num_hidden_layers)
        ])
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)
        residual = None
        for i in range(len(self.layers)):
            layer = self.layers[i]
            hidden_states, residual = layer(
                positions,
                hidden_states,
                kv_caches[i],
                input_metadata,
                residual,
            )
        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states


class InternLMForCausalLM(nn.Module):

    def __init__(
        self,
        config,
        linear_method: Optional[LinearMethodBase] = None,
    ):
        super().__init__()
        self.config = config
        self.linear_method = linear_method
        self.model = InternLMModel(config, linear_method)
        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
    ) -> torch.Tensor:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata)
        return hidden_states

    def sample(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     load_format: str = "auto",
                     revision: Optional[str] = None):
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if "rotary_emb.inv_freq" in name:
                continue
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
vllm/model_executor/models/yi.py → vllm/model_executor/models/internlm2.py  (renamed)

-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The vLLM team.
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only Yi model (https://01.ai) compatible with HuggingFace weights."""
+# -*- coding: utf-8 -*-
 from typing import Any, Dict, List, Optional, Tuple

 import torch
 from torch import nn
-from vllm.transformers_utils.configs.yi import YiConfig
+from transformers import PretrainedConfig

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import SiluAndMul
@@ -49,7 +27,7 @@ from vllm.sequence import SamplerOutput
 KVCache = Tuple[torch.Tensor, torch.Tensor]


-class YiMLP(nn.Module):
+class InternLM2MLP(nn.Module):

     def __init__(
         self,
@@ -63,10 +41,10 @@ class YiMLP(nn.Module):
             hidden_size, [intermediate_size] * 2,
             bias=False,
             linear_method=linear_method)
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           linear_method=linear_method)
+        self.w2 = RowParallelLinear(intermediate_size,
+                                    hidden_size,
+                                    bias=False,
+                                    linear_method=linear_method)
         if hidden_act != "silu":
             raise ValueError(f"Unsupported activation: {hidden_act}. "
                              "Only silu is supported for now.")
@@ -75,11 +53,11 @@ class YiMLP(nn.Module):
     def forward(self, x):
         gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x)
+        x, _ = self.w2(x)
         return x


-class YiAttention(nn.Module):
+class InternLM2Attention(nn.Module):

     def __init__(
         self,
@@ -114,7 +92,7 @@ class YiAttention(nn.Module):
         self.rope_theta = rope_theta
         self.max_position_embeddings = max_position_embeddings

-        self.qkv_proj = QKVParallelLinear(
+        self.wqkv = QKVParallelLinear(
             hidden_size,
             self.head_dim,
             self.total_num_heads,
@@ -122,17 +100,18 @@ class YiAttention(nn.Module):
             bias=False,
             linear_method=linear_method,
         )
-        self.o_proj = RowParallelLinear(
+        self.wo = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
             bias=False,
             linear_method=linear_method,
         )
         self.rotary_emb = get_rope(
             self.head_dim,
             rotary_dim=self.head_dim,
             max_position=max_position_embeddings,
-            base=self.rope_theta,
+            base=rope_theta,
             rope_scaling=rope_scaling,
         )
         self.attn = PagedAttention(self.num_heads,
@@ -147,20 +126,20 @@ class YiAttention(nn.Module):
         kv_cache: KVCache,
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_states)
+        qkv, _ = self.wqkv(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
         k_cache, v_cache = kv_cache
         attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
-        output, _ = self.o_proj(attn_output)
+        output, _ = self.wo(attn_output)
         return output


-class YiDecoderLayer(nn.Module):
+class InternLMDecoderLayer(nn.Module):

     def __init__(
         self,
-        config: YiConfig,
+        config: PretrainedConfig,
         linear_method: Optional[LinearMethodBase] = None,
     ) -> None:
         super().__init__()
@@ -169,7 +148,7 @@ class YiDecoderLayer(nn.Module):
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings",
                                           8192)
-        self.self_attn = YiAttention(
+        self.attention = InternLM2Attention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            num_kv_heads=config.num_key_value_heads,
@@ -178,14 +157,15 @@ class YiDecoderLayer(nn.Module):
             max_position_embeddings=max_position_embeddings,
             linear_method=linear_method,
         )
-        self.mlp = YiMLP(
+        self.feed_forward = InternLM2MLP(
             hidden_size=self.hidden_size,
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
             linear_method=linear_method,
         )
-        self.ln1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.ln2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.attention_norm = RMSNorm(config.hidden_size,
+                                      eps=config.rms_norm_eps)
+        self.ffn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

     def forward(
         self,
@@ -198,10 +178,11 @@ class YiDecoderLayer(nn.Module):
         # Self Attention
         if residual is None:
             residual = hidden_states
-            hidden_states = self.ln1(hidden_states)
+            hidden_states = self.attention_norm(hidden_states)
         else:
-            hidden_states, residual = self.ln1(hidden_states, residual)
-        hidden_states = self.self_attn(
+            hidden_states, residual = self.attention_norm(
+                hidden_states, residual)
+        hidden_states = self.attention(
             positions=positions,
             hidden_states=hidden_states,
             kv_cache=kv_cache,
@@ -209,28 +190,28 @@ class YiDecoderLayer(nn.Module):
         )

         # Fully Connected
-        hidden_states, residual = self.ln2(hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
+        hidden_states, residual = self.ffn_norm(hidden_states, residual)
+        hidden_states = self.feed_forward(hidden_states)
         return hidden_states, residual


-class YiModel(nn.Module):
+class InternLM2Model(nn.Module):

     def __init__(
         self,
-        config: YiConfig,
+        config: PretrainedConfig,
         linear_method: Optional[LinearMethodBase] = None,
     ) -> None:
         super().__init__()
         self.config = config
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
-        self.embed_tokens = VocabParallelEmbedding(
+        self.tok_embeddings = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
         )
         self.layers = nn.ModuleList([
-            YiDecoderLayer(config, linear_method)
+            InternLMDecoderLayer(config, linear_method)
             for _ in range(config.num_hidden_layers)
         ])
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -242,7 +223,7 @@ class YiModel(nn.Module):
         kv_caches: List[KVCache],
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
-        hidden_states = self.embed_tokens(input_ids)
+        hidden_states = self.tok_embeddings(input_ids)
         residual = None
         for i in range(len(self.layers)):
             layer = self.layers[i]
@@ -257,18 +238,18 @@ class YiModel(nn.Module):
         return hidden_states


-class YiForCausalLM(nn.Module):
+class InternLM2ForCausalLM(nn.Module):

     def __init__(
         self,
-        config: YiConfig,
+        config: PretrainedConfig,
         linear_method: Optional[LinearMethodBase] = None,
     ) -> None:
         super().__init__()
         self.config = config
         self.linear_method = linear_method
-        self.model = YiModel(config, linear_method)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.model = InternLM2Model(config, linear_method)
+        self.output = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.sampler = Sampler(config.vocab_size)

     def forward(
@@ -287,7 +268,7 @@ class YiForCausalLM(nn.Module):
         hidden_states: torch.Tensor,
         sampling_metadata: SamplingMetadata,
     ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
+        next_tokens = self.sampler(self.output.weight, hidden_states,
                                    sampling_metadata)
         return next_tokens
@@ -298,11 +279,8 @@ class YiForCausalLM(nn.Module):
                      revision: Optional[str] = None):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "w1", 0),
+            ("gate_up_proj", "w3", 1),
         ]
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in hf_model_weights_iterator(
@@ -325,6 +303,23 @@ class YiForCausalLM(nn.Module):
                 if name.endswith(".bias") and name not in params_dict:
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
+                if "wqkv" in name:
+                    config = self.config
+                    kv_groups = (config.num_attention_heads //
+                                 config.num_key_value_heads)
+                    head_dim = config.hidden_size // config.num_attention_heads
+                    loaded_weight = loaded_weight.view(-1, 2 + kv_groups,
+                                                       head_dim,
+                                                       loaded_weight.shape[-1])
+                    wq, wk, wv = torch.split(loaded_weight, [kv_groups, 1, 1],
+                                             dim=1)
+                    wq = wq.reshape(-1, wq.shape[-1])
+                    wk = wk.reshape(-1, wk.shape[-1])
+                    wv = wv.reshape(-1, wv.shape[-1])
+                    weight_loader = param.weight_loader
+                    weight_loader(param, wq, 'q')
+                    weight_loader(param, wk, 'k')
+                    weight_loader(param, wv, 'v')
+                else:
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
vllm/model_executor/models/llama.py

@@ -91,6 +91,7 @@ class LlamaAttention(nn.Module):
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
         linear_method: Optional[LinearMethodBase] = None,
+        bias: bool = False,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -120,13 +121,13 @@ class LlamaAttention(nn.Module):
             self.head_dim,
             self.total_num_heads,
             self.total_num_kv_heads,
-            bias=False,
+            bias=bias,
             linear_method=linear_method,
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
-            bias=False,
+            bias=bias,
             linear_method=linear_method,
         )
@@ -174,11 +175,13 @@ class LlamaDecoderLayer(nn.Module):
         self.self_attn = LlamaAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            num_kv_heads=config.num_key_value_heads,
+            num_kv_heads=getattr(config, "num_key_value_heads",
+                                 config.num_attention_heads),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
             linear_method=linear_method,
+            bias=getattr(config, "bias", False),
         )
         self.mlp = LlamaMLP(
             hidden_size=self.hidden_size,
@@ -269,7 +272,32 @@ class LlamaModel(nn.Module):

 class LlamaForCausalLM(nn.Module):
-    supports_lora = True
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]

     def __init__(
         self,
@@ -281,11 +309,11 @@ class LlamaForCausalLM(nn.Module):
         self.config = config
         self.linear_method = linear_method
         self.model = LlamaModel(config, linear_method, lora_config=lora_config)
-        unpadded_vocab_size = config.vocab_size
+        self.unpadded_vocab_size = config.vocab_size
         if lora_config:
-            unpadded_vocab_size += lora_config.lora_extra_vocab_size
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
         self.lm_head = ParallelLMHead(
-            unpadded_vocab_size,
+            self.unpadded_vocab_size,
             config.hidden_size,
             org_num_embeddings=config.vocab_size,
             padding_size=DEFAULT_VOCAB_PADDING_SIZE
@@ -293,7 +321,7 @@ class LlamaForCausalLM(nn.Module):
             # compatibility
             if not lora_config else lora_config.lora_vocab_padding_size,
         )
-        self.sampler = Sampler(unpadded_vocab_size, config.vocab_size)
+        self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size)

     def forward(
         self,
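The new class attributes above drive LoRA support: `packed_modules_mapping` records which HuggingFace projection names were fused into each vLLM module. A rough illustration of the name translation this mapping implies (the `packed_target` helper below is hypothetical, for illustration only; it is not vLLM's loader):

# Hypothetical helper, for illustration only: shows how packed_modules_mapping
# lets a LoRA loader redirect per-projection HF weights onto fused modules.
from typing import Optional, Tuple

packed_modules_mapping = {
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    "gate_up_proj": ["gate_proj", "up_proj"],
}


def packed_target(hf_module_name: str) -> Optional[Tuple[str, int]]:
    """Map an HF projection name to (fused vLLM module, shard index)."""
    for packed, members in packed_modules_mapping.items():
        if hf_module_name in members:
            return packed, members.index(hf_module_name)
    return None


assert packed_target("k_proj") == ("qkv_proj", 1)
assert packed_target("up_proj") == ("gate_up_proj", 1)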
vllm/model_executor/models/mistral.py

@@ -265,7 +265,32 @@ class MistralModel(nn.Module):

 class MistralForCausalLM(nn.Module):
-    supports_lora = True
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]

     def __init__(
         self,
vllm/model_executor/models/mixtral.py

@@ -24,11 +24,10 @@
 from typing import List, Optional, Tuple

 import torch
-import torch.nn.functional as F
 from torch import nn
 from transformers import MixtralConfig

+from vllm.config import LoRAConfig
 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.fused_moe import fused_moe
@@ -40,7 +39,7 @@ from vllm.model_executor.layers.linear import (LinearMethodBase,
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE)
 from vllm.model_executor.parallel_utils.communication_op import (
     tensor_model_parallel_all_reduce)
 from vllm.model_executor.parallel_utils.parallel_state import (
@@ -70,13 +69,14 @@ class MixtralMoE(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         params_dtype: Optional[torch.dtype] = None,
+        tp_size: Optional[int] = None,
     ):
         super().__init__()
-        tp_size = get_tensor_model_parallel_world_size()
+        self.tp_size = tp_size or get_tensor_model_parallel_world_size()
         self.num_total_experts = num_experts
         self.top_k = top_k
         self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size // tp_size
+        self.intermediate_size = intermediate_size // self.tp_size

         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
@@ -127,22 +127,17 @@ class MixtralMoE(nn.Module):
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (batch * sequence_length, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-        routing_weights, selected_experts = torch.topk(routing_weights,
-                                                       self.top_k,
-                                                       dim=-1)
-        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
-        final_hidden_states = fused_moe(hidden_states,
-                                        self.ws,
-                                        self.w2s,
-                                        routing_weights,
-                                        selected_experts,
+        final_hidden_states = fused_moe(hidden_states,
+                                        self.ws,
+                                        self.w2s,
+                                        router_logits,
+                                        self.top_k,
+                                        renormalize=True,
                                         inplace=True)

-        final_hidden_states = tensor_model_parallel_all_reduce(
-            final_hidden_states)
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states)

         return final_hidden_states.view(batch_size, sequence_length,
                                         hidden_size)
@@ -290,14 +285,19 @@ class MixtralModel(nn.Module):
         self,
         config: MixtralConfig,
         linear_method: Optional[LinearMethodBase] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
         self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
+        lora_vocab = (lora_config.lora_extra_vocab_size *
+                      (lora_config.max_loras or 1)) if lora_config else 0
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size

         self.embed_tokens = VocabParallelEmbedding(
-            config.vocab_size,
+            self.vocab_size,
             config.hidden_size,
+            org_num_embeddings=config.vocab_size,
         )
         self.layers = nn.ModuleList([
             MixtralDecoderLayer(config, linear_method=linear_method)
@@ -324,18 +324,52 @@ class MixtralModel(nn.Module):

 class MixtralForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]

     def __init__(
         self,
         config: MixtralConfig,
         linear_method: Optional[LinearMethodBase] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
         self.linear_method = linear_method
-        self.model = MixtralModel(config, linear_method)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
-        self.sampler = Sampler(config.vocab_size)
+        self.model = MixtralModel(config, linear_method,
+                                  lora_config=lora_config)
+        self.unpadded_vocab_size = config.vocab_size
+        if lora_config:
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.lm_head = ParallelLMHead(
+            self.unpadded_vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE
+            # We need bigger padding if using lora for kernel
+            # compatibility
+            if not lora_config else lora_config.lora_vocab_padding_size,
+        )
+        self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size)

     def forward(
         self,
vllm/model_executor/parallel_utils/communication_op.py

 from collections import namedtuple
 from typing import Any, Dict, List, Optional, Union

-from torch.distributed import ProcessGroup
 import torch
+from torch.distributed import ProcessGroup

+from vllm.model_executor.parallel_utils import cupy_utils
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     get_tensor_model_parallel_group,
+    is_cupy_nccl_enabled_for_all_reduce,
 )
 from vllm.model_executor.parallel_utils.custom_all_reduce import (
     custom_all_reduce)
@@ -31,8 +32,12 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
     out = custom_all_reduce(input_)
     if out is not None:
         return out
-    torch.distributed.all_reduce(input_,
-                                 group=get_tensor_model_parallel_group())
+    if is_cupy_nccl_enabled_for_all_reduce():
+        # TODO: support multiple parallel groups.
+        cupy_utils.all_reduce(input_)
+    else:
+        torch.distributed.all_reduce(input_,
+                                     group=get_tensor_model_parallel_group())
     return input_
vllm/model_executor/parallel_utils/cupy_utils.py  (new file, 0 → 100644)

"""CuPy utilities for all-reduce.

We use CuPy all-reduce instead of torch.distributed.all_reduce when capturing
CUDA graphs, because torch.distributed.all_reduce causes errors when capturing
CUDA graphs.

NOTE: We use CuPy 12.3 since CuPy 13.0 does not support Python 3.8.

TODO: Remove this file when torch.distributed.all_reduce is fixed.
"""
import contextlib

import torch
from torch.distributed import ReduceOp

try:
    import cupy
    from cupy.cuda import nccl
    from cupyx.distributed import NCCLBackend
except ImportError as e:
    cupy = e
    nccl = None

    class NCCLBackend:
        ...

_OP_MAPPING = {
    ReduceOp.SUM: "sum",
    ReduceOp.PRODUCT: "prod",
    ReduceOp.MIN: "min",
    ReduceOp.MAX: "max",
}


class NCCLBackendWithBFloat16(NCCLBackend):

    # This is enough to add bfloat16 support for most operations,
    # but broadcast will fail (will require changes in compiled
    # cupy code).
    def _get_nccl_dtype_and_count(self, array, count=None):
        nccl_dtype, count = super()._get_nccl_dtype_and_count(array, count)
        torch_dtype = getattr(array, "_torch_dtype", None)
        if torch_dtype is torch.bfloat16:
            nccl_dtype = nccl.NCCL_BFLOAT16
        return nccl_dtype, count

    def barrier(self) -> None:
        raise RuntimeError(
            "Currently, CuPy NCCL barrier is not supported since the TCP "
            "store is immediately stopped after the initialization.")


_NCCL_BACKEND = None
_WORLD_SIZE = 0


def is_initialized() -> bool:
    """Returns whether the NCCL backend is initialized."""
    return _NCCL_BACKEND is not None


@contextlib.contextmanager
def set_cupy_stream(stream: torch.cuda.Stream):
    """Set the cuda stream for communication"""
    cupy_stream = cupy.cuda.ExternalStream(stream.cuda_stream,
                                           stream.device_index)
    with cupy_stream:
        yield


def init_process_group(world_size: int, rank: int, host: str,
                       port: int) -> None:
    """Initializes the CuPy NCCL backend.

    # TODO: handle NCCL timeouts.
    """
    assert not is_initialized()

    if isinstance(cupy, Exception):
        raise ImportError(
            "NCCLBackend is not available. Please install cupy.") from cupy

    # TODO(woosuk): Create TP and PP process groups for CuPy.
    global _NCCL_BACKEND
    global _WORLD_SIZE
    assert world_size > 0, f"{world_size=} should be a positive integer"
    assert 0 <= rank < world_size, (
        f"{rank=} should be a integer between [0, {world_size})")

    cupy.cuda.runtime.setDevice(torch.cuda.current_device())
    _NCCL_BACKEND = NCCLBackendWithBFloat16(world_size, rank, host, port)
    _WORLD_SIZE = world_size

    # Stop the TCP store to prevent the deadlock issues at termination time.
    # FIXME(woosuk): This is hacky. Find a more robust solution.
    if rank == 0 and hasattr(_NCCL_BACKEND, "_store"):
        _NCCL_BACKEND._store.stop()


def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None:
    """All-reduces the input tensor across the process group."""
    assert input_.is_cuda, f"{input_} should be a cuda tensor"
    # Hack to support bfloat16
    torch_dtype = input_.dtype
    if torch_dtype is torch.bfloat16:
        # We need to view as float16, otherwise
        # cupy will fail. This will not change
        # the underlying data.
        input_ = input_.view(torch.float16)
    cupy_input = cupy.asarray(input_)
    cupy_input._torch_dtype = torch_dtype  # pylint: disable=protected-access
    _NCCL_BACKEND.all_reduce(in_array=cupy_input,
                             out_array=cupy_input,
                             op=_OP_MAPPING[op])


def destroy_process_group() -> None:
    """Destroys the NCCL backend."""
    global _NCCL_BACKEND
    global _WORLD_SIZE
    _NCCL_BACKEND = None
    _WORLD_SIZE = 0


def get_world_size() -> int:
    """Returns the world size."""
    return _WORLD_SIZE


def get_nccl_backend():
    return _NCCL_BACKEND
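For orientation, here is a minimal sketch of how the public functions of the new module fit together: initialize the backend, pin the communication stream, run an in-place all-reduce, and tear down. The `rank` / `world_size` / `host` / `port` plumbing is assumed to come from the launcher, not from this file, and the one-GPU-per-rank mapping is an assumption of the sketch.

# Sketch only: a hypothetical per-process driver exercising cupy_utils.
import torch
from vllm.model_executor.parallel_utils import cupy_utils


def demo_all_reduce(rank: int, world_size: int, host: str, port: int) -> None:
    torch.cuda.set_device(rank)  # assumes one GPU per rank
    cupy_utils.init_process_group(world_size, rank, host, port)

    x = torch.ones(8, dtype=torch.bfloat16, device="cuda")
    stream = torch.cuda.current_stream()
    with cupy_utils.set_cupy_stream(stream):
        cupy_utils.all_reduce(x)  # in-place sum across all ranks
    torch.cuda.synchronize()

    expected = torch.full_like(x.float(), float(world_size))
    assert torch.allclose(x.float(), expected)
    cupy_utils.destroy_process_group()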
vllm/model_executor/parallel_utils/custom_all_reduce.py

@@ -67,6 +67,10 @@ def get_handle() -> Optional["CustomAllreduce"]:
     return _CA_HANDLE


+def is_initialized() -> bool:
+    return _CA_HANDLE is not None
+
+
 @contextmanager
 def capture():
     try:
vllm/model_executor/parallel_utils/parallel_state.py

@@ -3,9 +3,12 @@
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 """Tensor and pipeline parallel groups."""
+import contextlib

 import torch

+from vllm.model_executor.parallel_utils import cupy_utils
+
 # Tensor model parallel group that the current rank belongs to.
 _TENSOR_MODEL_PARALLEL_GROUP = None
 # Pipeline model parallel group that the current rank belongs to.
@@ -206,3 +209,37 @@ def destroy_model_parallel():
     _PIPELINE_MODEL_PARALLEL_GROUP = None
     global _PIPELINE_GLOBAL_RANKS
     _PIPELINE_GLOBAL_RANKS = None
+
+    # Destroy the cupy states if any.
+    cupy_utils.destroy_process_group()
+
+
+# Whether to use cupy for nccl all reduce.
+# We use cupy for all reduce when using CUDA graph, because torch.distributed
+# is not well supported by CUDA graph.
+_ENABLE_CUPY_FOR_ALL_REDUCE = False
+
+
+@contextlib.contextmanager
+def with_cupy_nccl_for_all_reduce():
+    """use CuPy nccl instead of torch.distributed for all reduce"""
+    tp_size = get_tensor_model_parallel_world_size()
+    if tp_size == 1:
+        # No-op.
+        # NOTE(woosuk): We don't initialize CuPy when tp_size is 1.
+        yield
+    else:
+        global _ENABLE_CUPY_FOR_ALL_REDUCE
+        old = _ENABLE_CUPY_FOR_ALL_REDUCE
+        _ENABLE_CUPY_FOR_ALL_REDUCE = True
+
+        stream = torch.cuda.current_stream()
+        with cupy_utils.set_cupy_stream(stream):
+            yield
+        _ENABLE_CUPY_FOR_ALL_REDUCE = old
+
+
+def is_cupy_nccl_enabled_for_all_reduce():
+    """check if CuPy nccl is enabled for all reduce"""
+    global _ENABLE_CUPY_FOR_ALL_REDUCE
+    return _ENABLE_CUPY_FOR_ALL_REDUCE
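The context manager added above is what makes the CuPy path in communication_op.py take effect: the flag is only enabled around regions that are being recorded into a CUDA graph. A sketch of that usage pattern, assuming a hypothetical `run_decode_step` callable standing in for the model forward pass being captured (this is not vLLM's worker code):

# Sketch only, under the assumptions stated above.
import torch
from vllm.model_executor.parallel_utils.parallel_state import (
    with_cupy_nccl_for_all_reduce)


def capture_graph(run_decode_step) -> torch.cuda.CUDAGraph:
    graph = torch.cuda.CUDAGraph()
    # Inside this context, tensor_model_parallel_all_reduce() routes through
    # the CuPy NCCL backend, which can be recorded into a CUDA graph.
    with with_cupy_nccl_for_all_reduce():
        with torch.cuda.graph(graph):
            run_decode_step()
    return graph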
vllm/model_executor/weight_utils.py

@@ -11,9 +11,9 @@ from huggingface_hub import snapshot_download, HfFileSystem
 import numpy as np
 from safetensors.torch import load_file, save_file, safe_open
 import torch
-from transformers import PretrainedConfig
 from tqdm.auto import tqdm

+from vllm.config import ModelConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (get_quantization_config,
                                                      QuantizationConfig)
@@ -83,25 +83,22 @@ def convert_bin_to_safetensor_file(


 # TODO(woosuk): Move this to other place.
-def get_quant_config(
-    quantization: str,
-    model_name_or_path: str,
-    hf_config: PretrainedConfig,
-    cache_dir: Optional[str] = None,
-) -> QuantizationConfig:
-    quant_cls = get_quantization_config(quantization)
+def get_quant_config(model_config: ModelConfig) -> QuantizationConfig:
+    quant_cls = get_quantization_config(model_config.quantization)
     # Read the quantization config from the HF model config, if available.
-    hf_quant_config = getattr(hf_config, "quantization_config", None)
+    hf_quant_config = getattr(model_config.hf_config, "quantization_config",
+                              None)
     if hf_quant_config is not None:
         return quant_cls.from_config(hf_quant_config)
+    model_name_or_path = model_config.model
     is_local = os.path.isdir(model_name_or_path)
     if not is_local:
         # Download the config files.
-        with get_lock(model_name_or_path, cache_dir):
+        with get_lock(model_name_or_path, model_config.download_dir):
             hf_folder = snapshot_download(model_name_or_path,
+                                          revision=model_config.revision,
                                           allow_patterns="*.json",
-                                          cache_dir=cache_dir,
+                                          cache_dir=model_config.download_dir,
                                           tqdm_class=Disabledtqdm)
     else:
         hf_folder = model_name_or_path
@@ -112,10 +109,12 @@ def get_quant_config(
         f.endswith(x) for x in quant_cls.get_config_filenames())
     ]
     if len(quant_config_files) == 0:
-        raise ValueError(f"Cannot find the config file for {quantization}")
+        raise ValueError(
+            f"Cannot find the config file for {model_config.quantization}")
     if len(quant_config_files) > 1:
-        raise ValueError(f"Found multiple config files for {quantization}: "
-                         f"{quant_config_files}")
+        raise ValueError(
+            f"Found multiple config files for {model_config.quantization}: "
+            f"{quant_config_files}")

     quant_config_file = quant_config_files[0]
     with open(quant_config_file, "r") as f:
vllm/sequence.py

@@ -52,7 +52,6 @@ class SequenceStatus(enum.Enum):

 class SequenceData:
     """Data associated with a sequence.
-
     Args:
         prompt_token_ids: The token IDs of the prompt.
@@ -197,7 +196,7 @@ class Sequence:
         return self.data.cumulative_logprob

     def get_beam_search_score(self,
-                              length_penalty: float = 0.0,
+                              length_penalty: float = 1.0,
                               seq_len: Optional[int] = None,
                               eos_token_id: Optional[int] = None) -> float:
         """Calculate the beam search score with length penalty.
@@ -254,6 +253,7 @@ class SequenceGroup:
         self.seqs_dict = {seq.seq_id: seq for seq in seqs}
         self.sampling_params = sampling_params
         self.arrival_time = arrival_time
+        self.last_token_time = arrival_time
         self.lora_request = lora_request
         self.prefix: Optional[Prefix] = prefix
         self.prompt_logprobs: Optional[PromptLogprobs] = None
@@ -274,6 +274,12 @@ class SequenceGroup:
     def lora_int_id(self) -> int:
         return self.lora_request.lora_int_id if self.lora_request else 0

+    def get_last_latency(self, now: float) -> float:
+        """Gets last token latency for Request level timings."""
+        latency = now - self.last_token_time
+        self.last_token_time = now
+        return latency
+
     def get_max_num_running_seqs(self) -> int:
         """The maximum number of sequences running in parallel in the remaining
         lifetime of the request."""
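The body of `get_beam_search_score` is not shown in this hunk, but vLLM follows the HuggingFace convention of dividing the cumulative log-probability by `seq_len ** length_penalty`; under that convention the new default of 1.0 normalizes scores by length, while the old 0.0 effectively disabled normalization. A small standalone illustration of that convention (not the vLLM source):

# Illustration only, assuming the HuggingFace-style scoring convention.
def beam_score(cumulative_logprob: float, seq_len: int,
               length_penalty: float = 1.0) -> float:
    return cumulative_logprob / (seq_len ** length_penalty)


assert beam_score(-10.0, 20, length_penalty=0.0) == -10.0  # no normalization
assert beam_score(-10.0, 20, length_penalty=1.0) == -0.5   # length-normalized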
vllm/test_utils.py

@@ -15,8 +15,11 @@ def init_test_distributed_environment(
         tensor_parallel_size,
         worker_use_ray=True)
     distributed_init_method = f"tcp://localhost:{distributed_init_port}"
-    init_distributed_environment(parallel_config, rank,
-                                 distributed_init_method)
+    init_distributed_environment(
+        parallel_config,
+        rank,
+        cupy_port=None,
+        distributed_init_method=distributed_init_method)


 def multi_process_tensor_parallel(
vllm/transformers_utils/config.py

@@ -5,14 +5,12 @@ from transformers import AutoConfig, PretrainedConfig
 from vllm.transformers_utils.configs import *

 _CONFIG_REGISTRY = {
-    "aquila": AquilaConfig,
     "baichuan": BaiChuanConfig,
     "chatglm": ChatGLMConfig,
     "mpt": MPTConfig,
     "qwen": QWenConfig,
     "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
     "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
-    "yi": YiConfig,
 }
vllm/transformers_utils/configs/__init__.py

-from vllm.transformers_utils.configs.aquila import AquilaConfig
 from vllm.transformers_utils.configs.baichuan import BaiChuanConfig
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 from vllm.transformers_utils.configs.mpt import MPTConfig
@@ -7,14 +6,11 @@ from vllm.transformers_utils.configs.qwen import QWenConfig
 # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
 # `FalconConfig` class from the official HuggingFace transformers library.
 from vllm.transformers_utils.configs.falcon import RWConfig
-from vllm.transformers_utils.configs.yi import YiConfig

 __all__ = [
-    "AquilaConfig",
     "BaiChuanConfig",
     "ChatGLMConfig",
     "MPTConfig",
     "QWenConfig",
     "RWConfig",
-    "YiConfig",
 ]
vllm/transformers_utils/configs/aquila.py  (deleted, 100644 → 0)
# coding=utf-8
# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Aquila model configuration"""

from transformers import PretrainedConfig


class AquilaConfig(PretrainedConfig):
    model_type = "aquila"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=100008,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.006,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
vllm/transformers_utils/configs/yi.py  (deleted, 100644 → 0)
""" Yi model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

Yi_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class YiConfig(PretrainedConfig):
    r"""
    Reference:
    https://huggingface.co/01-ai/Yi-6B/blob/main/configuration_yi.py
    """
    model_type = "Yi"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=64000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=4,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        output_attentions=False,
        rope_theta=5000000.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.output_attentions = output_attentions
        self.rope_theta = rope_theta
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
vllm/utils.py

@@ -228,7 +228,8 @@ def create_kv_caches_with_random(
     device: Optional[str] = "cuda",
 ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)

     if isinstance(cache_dtype, str):
         if cache_dtype == "auto":
@@ -257,10 +258,13 @@ def create_kv_caches_with_random(
         key_cache = torch.empty(size=key_cache_shape,
                                 dtype=torch_dtype,
                                 device=device)
-        if cache_dtype in ["auto", "half", "bfloat16", "float"]:
-            key_cache.uniform_(-scale, scale)
-        elif cache_dtype == 'fp8_e5m2':
+        if cache_dtype == 'fp8_e5m2':
             _generate_random_fp8_e5m2(key_cache, -scale, scale)
+        elif torch_dtype in [torch.half, torch.bfloat16, torch.float]:
+            key_cache.uniform_(-scale, scale)
+        else:
+            raise ValueError(
+                f"Does not support key cache of type {cache_dtype}")
         key_caches.append(key_cache)

     value_cache_shape = (num_blocks, num_heads, head_size, block_size)
@@ -269,9 +273,12 @@ def create_kv_caches_with_random(
         value_cache = torch.empty(size=value_cache_shape,
                                   dtype=torch_dtype,
                                   device=device)
-        if cache_dtype in ["auto", "half", "bfloat16", "float"]:
-            value_cache.uniform_(-scale, scale)
-        elif cache_dtype == 'fp8_e5m2':
+        if cache_dtype == 'fp8_e5m2':
             _generate_random_fp8_e5m2(value_cache, -scale, scale)
+        elif torch_dtype in [torch.half, torch.bfloat16, torch.float]:
+            value_cache.uniform_(-scale, scale)
+        else:
+            raise ValueError(
+                f"Does not support value cache of type {cache_dtype}")
         value_caches.append(value_cache)
     return key_caches, value_caches