FGORGEOUS / minicpm_classify_pytorch · Commits · 24eacbc0

Commit 24eacbc0 authored May 09, 2024 by chenzk: "v1.0"

Changes: 356 · Showing 20 changed files with 962 additions and 0 deletions (+962, -0)
inference/vllm/vllm/transformers_utils/configs/__init__.py  +25 -0
inference/vllm/vllm/transformers_utils/configs/__pycache__/__init__.cpython-310.pyc  +0 -0
inference/vllm/vllm/transformers_utils/configs/__pycache__/aquila.cpython-310.pyc  +0 -0
inference/vllm/vllm/transformers_utils/configs/__pycache__/baichuan.cpython-310.pyc  +0 -0
inference/vllm/vllm/transformers_utils/configs/__pycache__/chatglm.cpython-310.pyc  +0 -0
inference/vllm/vllm/transformers_utils/configs/__pycache__/cpm.cpython-310.pyc  +0 -0
inference/vllm/vllm/transformers_utils/configs/__pycache__/cpm_mistral.cpython-310.pyc  +0 -0
inference/vllm/vllm/transformers_utils/configs/__pycache__/falcon.cpython-310.pyc  +0 -0
inference/vllm/vllm/transformers_utils/configs/__pycache__/mpt.cpython-310.pyc  +0 -0
inference/vllm/vllm/transformers_utils/configs/__pycache__/qwen.cpython-310.pyc  +0 -0
inference/vllm/vllm/transformers_utils/configs/__pycache__/yi.cpython-310.pyc  +0 -0
inference/vllm/vllm/transformers_utils/configs/aquila.py  +69 -0
inference/vllm/vllm/transformers_utils/configs/baichuan.py  +62 -0
inference/vllm/vllm/transformers_utils/configs/chatglm.py  +68 -0
inference/vllm/vllm/transformers_utils/configs/cpm.py  +123 -0
inference/vllm/vllm/transformers_utils/configs/cpm_mistral.py  +117 -0
inference/vllm/vllm/transformers_utils/configs/cpmmistral.py  +119 -0
inference/vllm/vllm/transformers_utils/configs/falcon.py  +87 -0
inference/vllm/vllm/transformers_utils/configs/mpt.py  +232 -0
inference/vllm/vllm/transformers_utils/configs/qwen.py  +60 -0
inference/vllm/vllm/transformers_utils/configs/__init__.py  0 → 100644

from vllm.transformers_utils.configs.aquila import AquilaConfig
from vllm.transformers_utils.configs.baichuan import BaiChuanConfig
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from vllm.transformers_utils.configs.cpm import CPMDragonflyConfig
from vllm.transformers_utils.configs.cpm_mistral import CPMMistralConfig
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
from vllm.transformers_utils.configs.falcon import RWConfig
from vllm.transformers_utils.configs.mpt import MPTConfig
from vllm.transformers_utils.configs.qwen import QWenConfig
from vllm.transformers_utils.configs.yi import YiConfig

__all__ = [
    "AquilaConfig",
    "BaiChuanConfig",
    "ChatGLMConfig",
    "CPMDragonflyConfig",
    "CPMMistralConfig",
    "MPTConfig",
    "QWenConfig",
    "RWConfig",
    "YiConfig",
]
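A minimal usage sketch, not part of the commit: it shows how the classes re-exported by this __init__.py can be imported and inspected, assuming the full package (including the yi module referenced above) and transformers are importable.

# Usage sketch (illustrative, not part of the commit): import the re-exported
# config classes and print the model_type string each one registers.
from vllm.transformers_utils.configs import (AquilaConfig, CPMDragonflyConfig,
                                             CPMMistralConfig)

for cls in (AquilaConfig, CPMDragonflyConfig, CPMMistralConfig):
    print(cls.__name__, "->", cls.model_type)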
The following compiled bytecode files were added in this commit (binary, no diff shown):

inference/vllm/vllm/transformers_utils/configs/__pycache__/__init__.cpython-310.pyc  0 → 100644  File added
inference/vllm/vllm/transformers_utils/configs/__pycache__/aquila.cpython-310.pyc  0 → 100644  File added
inference/vllm/vllm/transformers_utils/configs/__pycache__/baichuan.cpython-310.pyc  0 → 100644  File added
inference/vllm/vllm/transformers_utils/configs/__pycache__/chatglm.cpython-310.pyc  0 → 100644  File added
inference/vllm/vllm/transformers_utils/configs/__pycache__/cpm.cpython-310.pyc  0 → 100644  File added
inference/vllm/vllm/transformers_utils/configs/__pycache__/cpm_mistral.cpython-310.pyc  0 → 100644  File added
inference/vllm/vllm/transformers_utils/configs/__pycache__/falcon.cpython-310.pyc  0 → 100644  File added
inference/vllm/vllm/transformers_utils/configs/__pycache__/mpt.cpython-310.pyc  0 → 100644  File added
inference/vllm/vllm/transformers_utils/configs/__pycache__/qwen.cpython-310.pyc  0 → 100644  File added
inference/vllm/vllm/transformers_utils/configs/__pycache__/yi.cpython-310.pyc  0 → 100644  File added
inference/vllm/vllm/transformers_utils/configs/aquila.py  0 → 100644

# coding=utf-8
# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Aquila model configuration"""
from transformers import PretrainedConfig


class AquilaConfig(PretrainedConfig):
    model_type = "aquila"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=100008,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.006,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
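A short sketch, not part of the commit, illustrating the backward-compatibility branch above: when num_key_value_heads is omitted it falls back to num_attention_heads, while an explicit value enables a grouped-query layout. It assumes transformers is installed and this package is on the path.

# Sketch (illustrative): default vs. explicit num_key_value_heads.
from vllm.transformers_utils.configs.aquila import AquilaConfig

cfg = AquilaConfig(num_attention_heads=64)
assert cfg.num_key_value_heads == 64           # defaulted from num_attention_heads
cfg_gqa = AquilaConfig(num_attention_heads=64, num_key_value_heads=8)
assert cfg_gqa.num_key_value_heads == 8        # explicit grouped-query setting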
inference/vllm/vllm/transformers_utils/configs/baichuan.py  0 → 100644

# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers.configuration_utils import PretrainedConfig


class BaiChuanConfig(PretrainedConfig):
    model_type = "baichuan"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=64000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
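A small sketch, not part of the commit: BaiChuanConfig adds no logic of its own beyond storing fields, so serialization comes entirely from transformers' PretrainedConfig (to_dict/from_dict shown below are the standard transformers API, not something defined in this file).

# Sketch (illustrative): round-trip a config through the inherited
# PretrainedConfig serialization helpers.
from vllm.transformers_utils.configs.baichuan import BaiChuanConfig

cfg = BaiChuanConfig(hidden_size=5120, num_hidden_layers=40)
restored = BaiChuanConfig.from_dict(cfg.to_dict())
assert restored.hidden_size == 5120 and restored.num_hidden_layers == 40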
inference/vllm/vllm/transformers_utils/configs/chatglm.py  0 → 100644

# coding=utf-8
# Adapted from
# https://github.com/THUDM/ChatGLM2-6B
from transformers import PretrainedConfig


class ChatGLMConfig(PretrainedConfig):
    model_type = "chatglm"
    attribute_map = {
        "num_hidden_layers": "num_layers",
        "n_head_kv": "multi_query_group_num",
    }

    def __init__(self,
                 num_layers=28,
                 padded_vocab_size=65024,
                 hidden_size=4096,
                 ffn_hidden_size=13696,
                 kv_channels=128,
                 num_attention_heads=32,
                 seq_length=2048,
                 hidden_dropout=0.0,
                 attention_dropout=0.0,
                 layernorm_epsilon=1e-5,
                 rmsnorm=True,
                 apply_residual_connection_post_layernorm=False,
                 post_layer_norm=True,
                 add_bias_linear=False,
                 add_qkv_bias=False,
                 interleaved_qkv=False,
                 bias_dropout_fusion=True,
                 multi_query_attention=False,
                 multi_query_group_num=1,
                 apply_query_key_layer_scaling=True,
                 attention_softmax_in_fp32=True,
                 fp32_residual_connection=False,
                 quantization_bit=0,
                 pre_seq_len=None,
                 prefix_projection=False,
                 **kwargs):
        self.num_layers = num_layers
        self.vocab_size = padded_vocab_size
        self.padded_vocab_size = padded_vocab_size
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.kv_channels = kv_channels
        self.num_attention_heads = num_attention_heads
        self.seq_length = seq_length
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.layernorm_epsilon = layernorm_epsilon
        self.rmsnorm = rmsnorm
        self.apply_residual_connection_post_layernorm = (
            apply_residual_connection_post_layernorm)
        self.post_layer_norm = post_layer_norm
        self.add_bias_linear = add_bias_linear
        self.add_qkv_bias = add_qkv_bias
        self.bias_dropout_fusion = bias_dropout_fusion
        self.multi_query_attention = multi_query_attention
        self.multi_query_group_num = multi_query_group_num
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.fp32_residual_connection = fp32_residual_connection
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection
        self.interleaved_qkv = interleaved_qkv
        super().__init__(**kwargs)
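A short sketch, not part of the commit: the attribute_map above lets callers read the common HuggingFace attribute names while the config stores ChatGLM's native field names; the aliasing itself is provided by PretrainedConfig.

# Sketch (illustrative): attribute_map aliases the HF-style names onto the
# ChatGLM-native fields.
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig

cfg = ChatGLMConfig(num_layers=2, multi_query_group_num=4)
assert cfg.num_hidden_layers == 2    # aliased to num_layers
assert cfg.n_head_kv == 4            # aliased to multi_query_group_num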
inference/vllm/vllm/transformers_utils/configs/cpm.py  0 → 100644

# coding=utf-8
# Copyright 2022 The OpenBMB team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Tuple

import torch
import torch.nn.functional as F
from transformers.configuration_utils import PretrainedConfig
from typing_extensions import TypedDict


class CPMDragonflyConfig(PretrainedConfig):
    model_type = "cpm_dragonfly"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_key_value_heads": "num_kv_heads",
        "hidden_act": "activate_fn",
        "hidden_size": "dim_model",
        "num_attention_heads": "num_heads",
        "intermediate_size": "dim_ff",
        "num_hidden_layers": "num_layers",
        "vocab_size": "vocab_size",
        "rms_norm_eps": "eps",
        "scale_emb": "scale_emb",
        "scale_depth": "scale_depth",
        "scale": "scale",
        "attention_scale": "attention_scale",
        "qk_norm": "qk_norm",
        "ffn_gated": "ffn_gated",
    }  # model specific to common

    def __init__(
        self,
        vocab_size=32000,
        dim_model=4096,
        num_heads=32,
        num_kv_heads=32,
        dim_head=128,
        dim_ff=11008,
        num_layers=32,
        dropout_p=0.0,
        activate_fn="silu",
        scale=True,
        scale_emb: float = 1.,
        scale_depth: float = -1,
        dim_model_base: int = None,
        eps=1e-5,
        init_std=0.02,
        half: bool = True,
        half_type='bf16',
        mask_modules: Optional[List[Tuple[bool, bool]]] = None,
        use_flash_attn: bool = True,
        flash_attn_mask_shape="1d",
        flash_impl="cuda",
        base=10000,
        non_checkpointing_layers_num: int = 0,
        attention_scale=1,
        qk_norm=False,
        ffn_gated=True,
        tie_lm_head=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.dim_model = dim_model
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.dim_head = dim_head
        self.dim_ff = dim_ff
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        self.activate_fn = activate_fn
        self.scale = scale
        self.scale_emb = scale_emb
        self.half = half
        self.half_type = half_type
        self.dim_model_base = dim_model_base
        self.scale_depth = scale_depth
        self.eps = eps
        self.init_std = init_std
        self.flash_impl = flash_impl
        self.mask_modules = mask_modules
        self.use_flash_attn = use_flash_attn
        self.flash_attn_mask_shape = flash_attn_mask_shape
        self.base = base
        self.attention_scale = attention_scale
        self.qk_norm = qk_norm
        self.ffn_gated = ffn_gated
        self.non_checkpointing_layers_num = non_checkpointing_layers_num
        self.tie_lm_head = tie_lm_head
        self.use_bfloat16 = True if self.half_type == 'bf16' else False
        print("gated or not {}, tie or not {}, qk_norm {}".format(
            self.ffn_gated, self.tie_lm_head, self.qk_norm))
        super().__init__(architectures=["CPMDragonflyForCausalLM"])

    @property
    def scale_width(self,):
        if self.scale:
            return self.dim_model / self.dim_model_base
        else:
            return 1.

    @property
    def dtype(self,):
        if self.half:
            if self.half_type == 'bf16':
                return torch.bfloat16
            else:
                return torch.half
        else:
            return torch.float
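A short sketch, not part of the commit: the dtype and scale_width properties derive values from half/half_type and dim_model/dim_model_base. Note that dim_model_base must be supplied whenever scale is True, since scale_width would otherwise divide by None, and that the constructor does not forward **kwargs to PretrainedConfig.

# Sketch (illustrative): derived properties and the attribute_map alias.
import torch

from vllm.transformers_utils.configs.cpm import CPMDragonflyConfig

cfg = CPMDragonflyConfig(dim_model=4096, dim_model_base=256)
assert cfg.dtype is torch.bfloat16       # half=True, half_type='bf16'
assert cfg.scale_width == 16.0           # dim_model / dim_model_base
assert cfg.hidden_size == 4096           # attribute_map alias for dim_model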
inference/vllm/vllm/transformers_utils/configs/cpm_mistral.py  0 → 100644

# coding=utf-8
# Copyright 2022 The OpenBMB team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Tuple

import torch
import torch.nn.functional as F
from transformers.configuration_utils import PretrainedConfig
from typing_extensions import TypedDict


class CPMMistralConfig(PretrainedConfig):
    model_type = "cpm_mistral"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_key_value_heads": "num_kv_heads",
        "hidden_act": "activate_fn",
        "hidden_size": "dim_model",
        "num_attention_heads": "num_heads",
        "intermediate_size": "dim_ff",
        "num_hidden_layers": "num_layers",
        "vocab_size": "vocab_size",
        "rms_norm_eps": "eps",
        "scale_emb": "scale_emb",
        "scale_depth": "scale_depth",
        "scale": "scale",
        "attention_scale": "attention_scale"
    }

    def __init__(
        self,
        vocab_size=32000,
        dim_model=4096,
        num_heads=32,
        num_kv_heads=32,
        dim_head=128,
        dim_ff=11008,
        num_layers=32,
        dropout_p=0.0,
        activate_fn="silu",
        scale=True,
        scale_emb: float = 1.,
        scale_depth: float = -1,
        dim_model_base: int = None,
        eps=1e-5,
        init_std=0.02,
        half: bool = True,
        half_type='bf16',
        mask_modules: Optional[List[Tuple[bool, bool]]] = None,
        use_flash_attn: bool = True,
        flash_attn_mask_shape="1d",
        flash_impl="cuda",
        base=10000,
        non_checkpointing_layers_num: int = 0,
        attention_scale=1,
        max_position_embeddings=8192,
        rope_scaling=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.dim_model = dim_model
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.dim_head = dim_head
        self.dim_ff = dim_ff
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        self.activate_fn = activate_fn
        self.scale = scale
        self.scale_emb = scale_emb
        self.half = half
        self.half_type = half_type
        self.dim_model_base = dim_model_base
        self.scale_depth = scale_depth
        self.eps = eps
        self.init_std = init_std
        self.flash_impl = flash_impl
        self.mask_modules = mask_modules
        self.use_flash_attn = use_flash_attn
        self.flash_attn_mask_shape = flash_attn_mask_shape
        self.base = base
        self.attention_scale = attention_scale
        self.max_position_embeddings = max_position_embeddings
        self.non_checkpointing_layers_num = non_checkpointing_layers_num
        self.rope_scaling = rope_scaling
        super().__init__(architectures=["CPMMistralForCausalLM"])

    @property
    def scale_width(self,):
        if self.scale:
            return self.dim_model / self.dim_model_base
        else:
            return 1.

    @property
    def dtype(self,):
        if self.half:
            if self.half_type == 'bf16':
                return torch.bfloat16
            else:
                return torch.half
        else:
            return torch.float
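A short sketch, not part of the commit: compared with CPMDragonflyConfig, this variant adds max_position_embeddings and rope_scaling and drops the qk_norm/ffn_gated/tie_lm_head switches. The rope_scaling dictionary shape shown below is only illustrative; the config stores whatever object is passed in.

# Sketch (illustrative): the Mistral-style fields added by this config.
from vllm.transformers_utils.configs.cpm_mistral import CPMMistralConfig

cfg = CPMMistralConfig(max_position_embeddings=16384,
                       rope_scaling={"type": "linear", "factor": 2.0})
assert cfg.max_position_embeddings == 16384
assert cfg.rope_scaling["factor"] == 2.0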
inference/vllm/vllm/transformers_utils/configs/cpmmistral.py  0 → 100644

# coding=utf-8
# Copyright 2022 The OpenBMB team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List
from typing import Optional
from typing import Tuple

import torch
import torch.nn.functional as F
from typing_extensions import TypedDict

from transformers.configuration_utils import PretrainedConfig


class CPMMistralConfig(PretrainedConfig):
    model_type = "cpmmistral"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_key_value_heads": "num_kv_heads",
        "hidden_act": "activate_fn",
        "hidden_size": "dim_model",
        "num_attention_heads": "num_heads",
        "intermediate_size": "dim_ff",
        "num_hidden_layers": "num_layers",
        "vocab_size": "vocab_size",
        "rms_norm_eps": "eps",
        "scale_emb": "scale_emb",
        "scale_depth": "scale_depth",
        "scale": "scale",
        "attention_scale": "attention_scale"
    }

    def __init__(
        self,
        vocab_size=32000,
        dim_model=4096,
        num_heads=32,
        num_kv_heads=32,
        dim_head=128,
        dim_ff=11008,
        num_layers=32,
        dropout_p=0.0,
        activate_fn="silu",
        scale=True,
        scale_emb: float = 1.,
        scale_depth: float = -1,
        dim_model_base: int = None,
        eps=1e-5,
        init_std=0.02,
        half: bool = True,
        half_type='bf16',
        mask_modules: Optional[List[Tuple[bool, bool]]] = None,
        use_flash_attn: bool = True,
        flash_attn_mask_shape="1d",
        flash_impl="cuda",
        base=10000,
        non_checkpointing_layers_num: int = 0,
        attention_scale=1,
        max_position_embeddings=8192,
        rope_scaling=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.dim_model = dim_model
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.dim_head = dim_head
        self.dim_ff = dim_ff
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        self.activate_fn = activate_fn
        self.scale = scale
        self.scale_emb = scale_emb
        self.half = half
        self.half_type = half_type
        self.dim_model_base = dim_model_base
        self.scale_depth = scale_depth
        self.eps = eps
        self.init_std = init_std
        self.flash_impl = flash_impl
        self.mask_modules = mask_modules
        self.use_flash_attn = use_flash_attn
        self.flash_attn_mask_shape = flash_attn_mask_shape
        self.base = base
        self.attention_scale = attention_scale
        self.max_position_embeddings = max_position_embeddings
        self.non_checkpointing_layers_num = non_checkpointing_layers_num
        self.rope_scaling = rope_scaling
        super().__init__(architectures=["CPMMistralForCausalLM"])

    @property
    def scale_width(self,):
        if self.scale:
            return self.dim_model / self.dim_model_base
        else:
            return 1.

    @property
    def dtype(self,):
        if self.half:
            if self.half_type == 'bf16':
                return torch.bfloat16
            else:
                return torch.half
        else:
            return torch.float
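A short sketch, not part of the commit: cpmmistral.py defines a second CPMMistralConfig whose only functional difference from cpm_mistral.py is the model_type string; the package __init__.py above re-exports only the cpm_mistral variant, so the two must be disambiguated by module path.

# Sketch (illustrative): the exported class and the local duplicate register
# different model_type strings.
from vllm.transformers_utils.configs import CPMMistralConfig as ExportedConfig
from vllm.transformers_utils.configs.cpmmistral import CPMMistralConfig as LocalConfig

assert ExportedConfig.model_type == "cpm_mistral"
assert LocalConfig.model_type == "cpmmistral"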
inference/vllm/vllm/transformers_utils/configs/falcon.py  0 → 100644

# Adapted from
# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
# Copyright 2023 The vLLM team.
# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Falcon configuration"""
from transformers.configuration_utils import PretrainedConfig


class RWConfig(PretrainedConfig):
    model_type = "falcon"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_kv_heads": "n_head_kv",
    }

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        multi_query=True,
        n_head_kv=None,
        alibi=False,
        bias=False,
        parallel_attn=False,
        new_decoder_architecture=False,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size
        # Backward compatibility with n_embed kwarg
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.multi_query = multi_query
        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
        self.alibi = alibi
        self.bias = bias
        self.parallel_attn = parallel_attn
        self.new_decoder_architecture = new_decoder_architecture

        if self.hidden_size == 8192:
            # Hack for falcon-40b
            self.new_decoder_architecture = True

        super().__init__(bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)

    @property
    def head_dim(self):
        return self.hidden_size // self.n_head

    @property
    def rotary(self):
        return not self.alibi
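A short sketch, not part of the commit: head_dim and rotary are derived properties, and the hidden_size == 8192 check above force-enables new_decoder_architecture for falcon-40b-shaped configs. The dimensions below are illustrative values only.

# Sketch (illustrative): derived properties and the falcon-40b hack.
from vllm.transformers_utils.configs.falcon import RWConfig

cfg = RWConfig(hidden_size=4544, n_head=71, alibi=False)
assert cfg.head_dim == 64            # hidden_size // n_head
assert cfg.rotary is True            # rotary embeddings whenever alibi is off

cfg40b = RWConfig(hidden_size=8192)
assert cfg40b.new_decoder_architecture is True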
inference/vllm/vllm/transformers_utils/configs/mpt.py  0 → 100644

# coding=utf-8
# Copied from
# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
"""A HuggingFace-style model configuration."""
import warnings
from typing import Any, Dict, Optional, Union

from transformers import PretrainedConfig

attn_config_defaults: Dict = {
    'attn_type': 'multihead_attention',
    'attn_pdrop': 0.0,
    'attn_impl': 'triton',
    'qk_ln': False,
    'clip_qkv': None,
    'softmax_scale': None,
    'prefix_lm': False,
    'attn_uses_sequence_id': False,
    'alibi': False,
    'alibi_bias_max': 8
}
ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
init_config_defaults: Dict = {
    'name': 'kaiming_normal_',
    'fan_mode': 'fan_in',
    'init_nonlinearity': 'relu',
    'init_div_is_residual': True,
    'emb_init_std': None,
    'emb_init_uniform_lim': None,
    'init_std': None,
    'init_gain': 0.0
}


class MPTConfig(PretrainedConfig):
    model_type = 'mpt'
    attribute_map = {
        'num_attention_heads': 'n_heads',
        'hidden_size': 'd_model',
        'num_hidden_layers': 'n_layers',
    }

    # pylint: disable=dangerous-default-value
    def __init__(self,
                 d_model: int = 2048,
                 n_heads: int = 16,
                 n_layers: int = 24,
                 expansion_ratio: int = 4,
                 max_seq_len: int = 2048,
                 vocab_size: int = 50368,
                 resid_pdrop: float = 0.0,
                 emb_pdrop: float = 0.0,
                 learned_pos_emb: bool = True,
                 attn_config: Dict = attn_config_defaults,
                 ffn_config: Dict = ffn_config_defaults,
                 init_device: str = 'cpu',
                 logit_scale: Optional[Union[float, str]] = None,
                 no_bias: bool = False,
                 embedding_fraction: float = 1.0,
                 norm_type: str = 'low_precision_layernorm',
                 use_cache: bool = False,
                 init_config: Dict = init_config_defaults,
                 fc_type: str = 'torch',
                 verbose: Optional[int] = None,
                 **kwargs: Any):
        # pylint: disable=line-too-long
        """The MPT configuration class.

        Args:
            d_model (int): The size of the embedding dimension of the model.
            n_heads (int): The number of attention heads.
            n_layers (int): The number of layers in the model.
            expansion_ratio (int): The ratio of the up/down scale in the ffn.
            max_seq_len (int): The maximum sequence length of the model.
            vocab_size (int): The size of the vocabulary.
            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
            emb_pdrop (float): The dropout probability for the embedding layer.
            learned_pos_emb (bool): Whether to use learned positional embeddings
            attn_config (Dict): A dictionary used to configure the model's attention module:
                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention
                attn_pdrop (float): The dropout probability for the attention layers.
                attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
                qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
                clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to this value.
                softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None, use the default scale of ``1/sqrt(d_keys)``.
                prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
                attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id. When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates which sub-sequence each token belongs to. Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
                alibi (bool): Whether to use the alibi bias instead of position embeddings.
                alibi_bias_max (int): The maximum value of the alibi bias.
                kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
            ffn_config (Dict): A dictionary used to configure the model's ffn module:
                ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp
            init_device (str): The device to use for parameter initialization.
            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
            no_bias (bool): Whether to use bias in all layers.
            verbose (int): The verbosity level. 0 is silent.
            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
            norm_type (str): choose type of norm to use
            use_cache (bool): Whether or not the model should return the last key/values attentions
            init_config (Dict): A dictionary used to configure the model initialization:
                init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_', 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
                init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
                emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
                emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
                init_std (float): The standard deviation of the normal distribution used to initialize the model, if using the baseline_ parameter initialization scheme.
                init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
                fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
                init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
                ---
                See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
            fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs.
        """
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.expansion_ratio = expansion_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        self.attn_config = attn_config
        self.ffn_config = ffn_config
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.use_cache = use_cache
        self.init_config = init_config
        self.fc_type = fc_type
        if verbose is not None:
            warnings.warn(
                DeprecationWarning(
                    'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.'
                ))
        if 'name' in kwargs:
            del kwargs['name']
        if 'loss_fn' in kwargs:
            del kwargs['loss_fn']
        if self.attn_config.get('alibi', False):
            self.learned_pos_emb = False
            warnings.warn(
                f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`'
            )
        super().__init__(**kwargs)

        self._validate_config()

    def _set_config_defaults(
            self, config: Dict[str, Any],
            config_defaults: Dict[str, Any]) -> Dict[str, Any]:
        for (k, v) in config_defaults.items():
            if k not in config:
                config[k] = v
        return config

    def _validate_config(self) -> None:
        self.attn_config = self._set_config_defaults(self.attn_config,
                                                     attn_config_defaults)
        self.ffn_config = self._set_config_defaults(self.ffn_config,
                                                    ffn_config_defaults)
        self.init_config = self._set_config_defaults(self.init_config,
                                                     init_config_defaults)
        if self.d_model % self.n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')
        if any((prob < 0 or prob > 1 for prob in [
                self.attn_config['attn_pdrop'], self.resid_pdrop,
                self.emb_pdrop
        ])):
            raise ValueError(
                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1"  # pylint: disable=line-too-long
            )
        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
            raise ValueError(
                f"Unknown attn_impl={self.attn_config['attn_impl']}")
        if self.attn_config['prefix_lm'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
                'prefix_lm only implemented with torch and triton attention.')
        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
                'torch', 'triton'
        ]:
            raise NotImplementedError(
                'alibi only implemented with torch and triton attention.')
        if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
                'attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError(
                'attn_uses_sequence_id only implemented with torch and triton attention.'  # pylint: disable=line-too-long
            )
        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError(
                'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!'  # pylint: disable=line-too-long
            )
        if isinstance(self.logit_scale,
                      str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(
                f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."  # pylint: disable=line-too-long
            )
        if self.init_config.get('name', None) is None:
            raise ValueError(
                f"self.init_config={self.init_config!r} 'name' needs to be set."
            )
        if not self.learned_pos_emb and (not self.attn_config['alibi']):
            warnings.warn(
                'Positional information not being provided to the model.')
        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
            try:
                # pylint: disable=import-outside-toplevel
                import transformer_engine.pytorch as te
                del te
            except Exception as exc:
                raise ImportError(
                    # pylint: disable=line-too-long
                    'TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. '
                    +
                    'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n'
                    + 'pip install flash-attn==1.0.6 --no-build-isolation \n' +
                    'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156'
                ) from exc
        if self.ffn_config['ffn_type'] == 'mptmlp':
            self.ffn_config['fc_type'] = self.fc_type
        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
            self.ffn_config['bias'] = not self.no_bias
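A short sketch, not part of the commit: _validate_config runs at the end of __init__, filling the attn/ffn/init sub-config defaults and rejecting inconsistent values such as a d_model that is not divisible by n_heads. The numbers below are illustrative.

# Sketch (illustrative): defaults are filled in, and invalid shapes raise.
from vllm.transformers_utils.configs.mpt import MPTConfig

cfg = MPTConfig(d_model=1024, n_heads=16)
assert cfg.attn_config['attn_impl'] == 'triton'   # filled from attn_config_defaults

try:
    MPTConfig(d_model=1000, n_heads=16)           # 1000 is not divisible by 16
except ValueError as err:
    print(err)                                    # 'd_model must be divisible by n_heads'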
inference/vllm/vllm/transformers_utils/configs/qwen.py  0 → 100644

# Copyright (c) Alibaba Cloud.
# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE

from transformers import PretrainedConfig


class QWenConfig(PretrainedConfig):
    model_type = "qwen"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        num_hidden_layers=32,
        num_attention_heads=32,
        emb_dropout_prob=0.0,
        attn_dropout_prob=0.0,
        layer_norm_epsilon=1e-6,
        initializer_range=0.02,
        max_position_embeddings=8192,
        scale_attn_weights=True,
        use_cache=True,
        bf16=False,
        fp16=False,
        fp32=False,
        kv_channels=128,
        rotary_pct=1.0,
        rotary_emb_base=10000,
        use_dynamic_ntk=True,
        use_logn_attn=True,
        use_flash_attn="auto",
        intermediate_size=22016,
        no_bias=True,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.emb_dropout_prob = emb_dropout_prob
        self.attn_dropout_prob = attn_dropout_prob
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.max_position_embeddings = max_position_embeddings
        self.bf16 = bf16
        self.fp16 = fp16
        self.fp32 = fp32
        self.kv_channels = kv_channels
        self.rotary_pct = rotary_pct
        self.rotary_emb_base = rotary_emb_base
        self.use_dynamic_ntk = use_dynamic_ntk
        self.use_logn_attn = use_logn_attn
        self.use_flash_attn = use_flash_attn
        self.no_bias = no_bias
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
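A final sketch, not part of the commit: QWenConfig keeps the Qwen-specific switches (precision flags, dynamic NTK, logn attention, flash-attention mode) as plain attributes and forwards only tie_word_embeddings to PretrainedConfig.

# Sketch (illustrative): Qwen-specific switches are stored as-is.
from vllm.transformers_utils.configs.qwen import QWenConfig

cfg = QWenConfig(bf16=True, use_flash_attn="auto")
assert cfg.bf16 is True and cfg.fp16 is False
assert cfg.use_flash_attn == "auto"
assert cfg.tie_word_embeddings is False    # forwarded to PretrainedConfig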