Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ac4f685b
Commit
ac4f685b
authored
Jan 06, 2026
by
zhuwenwen
Browse files
remove qiyuan-8b-v2 and FM9GForCausalLM
parent
05e8b083
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
10 additions
and
1309 deletions
+10
-1309
vllm/config/model.py
vllm/config/model.py
+1
-1
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+3
-10
vllm/inputs/preprocess.py
vllm/inputs/preprocess.py
+1
-4
vllm/model_executor/models/fm9g.py
vllm/model_executor/models/fm9g.py
+0
-592
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+0
-1
vllm/tokenizers/detokenizer_utils.py
vllm/tokenizers/detokenizer_utils.py
+5
-19
vllm/transformers_utils/configs/__init__.py
vllm/transformers_utils/configs/__init__.py
+0
-2
vllm/transformers_utils/configs/fm9g.py
vllm/transformers_utils/configs/fm9g.py
+0
-187
vllm/transformers_utils/tokenizers/__init__.py
vllm/transformers_utils/tokenizers/__init__.py
+0
-8
vllm/transformers_utils/tokenizers/cpm_9g.py
vllm/transformers_utils/tokenizers/cpm_9g.py
+0
-483
vllm/v1/engine/detokenizer.py
vllm/v1/engine/detokenizer.py
+0
-2
No files found.
vllm/config/model.py
View file @
ac4f685b
...
...
@@ -74,7 +74,7 @@ logger = init_logger(__name__)
RunnerOption
=
Literal
[
"auto"
,
RunnerType
]
ConvertType
=
Literal
[
"none"
,
"embed"
,
"classify"
,
"reward"
]
ConvertOption
=
Literal
[
"auto"
,
ConvertType
]
TokenizerMode
=
Literal
[
"auto"
,
"hf"
,
"slow"
,
"mistral"
,
"deepseek_v32"
,
"cpm"
]
TokenizerMode
=
Literal
[
"auto"
,
"hf"
,
"slow"
,
"mistral"
,
"deepseek_v32"
]
ModelDType
=
Literal
[
"auto"
,
"half"
,
"float16"
,
"bfloat16"
,
"float"
,
"float32"
]
LogprobsMode
=
Literal
[
"raw_logits"
,
"raw_logprobs"
,
"processed_logits"
,
"processed_logprobs"
...
...
vllm/entrypoints/openai/serving_engine.py
View file @
ac4f685b
...
...
@@ -118,7 +118,6 @@ from vllm.utils.async_utils import (
)
from
vllm.utils.collection_utils
import
is_list_of
from
vllm.v1.engine
import
EngineCoreRequest
from
vllm.transformers_utils.tokenizers
import
CPM9GTokenizer
class
GenerationError
(
Exception
):
...
...
@@ -260,9 +259,6 @@ class OpenAIServing:
self
.
io_processor
=
self
.
models
.
io_processor
self
.
model_config
=
self
.
models
.
model_config
self
.
max_model_len
=
self
.
model_config
.
max_model_len
self
.
tokenizer_mode
=
self
.
models
.
model_config
.
tokenizer_mode
if
self
.
models
.
model_config
.
tokenizer_mode
==
"cpm"
:
self
.
tokenizer
=
CPM9GTokenizer
(
self
.
models
.
model_config
.
model
,
trust_remote_code
=
True
)
def
_get_tool_parser
(
self
,
tool_parser_name
:
str
|
None
=
None
,
enable_auto_tools
:
bool
=
False
...
...
@@ -937,11 +933,8 @@ class OpenAIServing:
max_length
=
truncate_prompt_tokens
,
)
if
self
.
tokenizer_mode
==
"cpm"
:
input_ids
=
[
self
.
tokenizer
.
bos_id
]
+
self
.
tokenizer
.
encode
(
prompt
)
else
:
input_ids
=
encoded
.
input_ids
input_ids
=
encoded
.
input_ids
input_text
=
prompt
return
self
.
_validate_input
(
request
,
input_ids
,
input_text
)
...
...
@@ -965,7 +958,7 @@ class OpenAIServing:
input_text
=
""
else
:
async_tokenizer
=
self
.
_get_async_tokenizer
(
tokenizer
)
input_text
=
await
async_tokenizer
.
decode
(
input_ids
)
if
self
.
tokenizer_mode
!=
"cpm"
else
await
self
.
tokenizer
.
decode_all
(
input_ids
)
input_text
=
await
async_tokenizer
.
decode
(
input_ids
)
return
self
.
_validate_input
(
request
,
input_ids
,
input_text
)
...
...
vllm/inputs/preprocess.py
View file @
ac4f685b
...
...
@@ -226,10 +226,7 @@ class InputPreprocessor:
if
encoder_config
and
encoder_config
.
get
(
"do_lower_case"
,
False
):
prompt
=
prompt
.
lower
()
if
self
.
model_config
.
tokenizer_mode
==
"cpm"
:
return
[
tokenizer
.
bos_id
]
+
tokenizer
.
encode
(
prompt
)
else
:
return
tokenizer
.
encode
(
prompt
,
**
tokenization_kwargs
)
return
tokenizer
.
encode
(
prompt
,
**
tokenization_kwargs
)
def
_get_mm_processor
(
self
)
->
BaseMultiModalProcessor
:
if
not
hasattr
(
self
,
"_mm_processor"
):
...
...
vllm/model_executor/models/fm9g.py
deleted
100644 → 0
View file @
05e8b083
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only FM9G model compatible with HuggingFace weights."""
import
math
from
typing
import
Any
,
Dict
,
Iterable
,
Optional
,
Set
,
Tuple
,
Union
,
List
import
torch
from
torch
import
nn
from
vllm.transformers_utils.configs
import
FM9GConfig
from
vllm.attention
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
tensor_model_parallel_all_reduce
)
from
vllm.model_executor.layers.activation
import
FatreluAndMul
,
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
fused_moe
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
QKVParallelLinear
,
ReplicatedLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
is_pp_missing_parameter
,
make_empty_intermediate_tensors_factory
,
make_layers
,
maybe_prefix
)
class FM9GMoE(nn.Module):
    """Mixture-of-experts feed-forward block with tensor-parallel experts.

    Every rank keeps a ``1 / tp_size`` slice of each expert's intermediate
    dimension. The forward pass runs the fused MoE kernel over the local
    slices and, when more than one rank takes part, all-reduces the partial
    outputs across ranks.
    """

    def __init__(
        self,
        num_experts: int,
        top_k: int,
        hidden_size: int,
        intermediate_size: int,
        params_dtype: Optional[torch.dtype] = None,
        tp_size: Optional[int] = None,
    ):
        """Create the router and allocate the stacked expert weights.

        Args:
            num_experts: Total number of experts.
            top_k: Number of experts selected per token.
            hidden_size: Width of the incoming hidden states.
            intermediate_size: Full (unsharded) expert intermediate width.
            params_dtype: Dtype of the weights; the torch default dtype is
                used when ``None``.
            tp_size: Tensor-parallel degree; the tensor-parallel world size
                is used when this is ``None`` (or any other falsy value).
        """
        super().__init__()
        self.tp_size = (tp_size if tp_size else
                        get_tensor_model_parallel_world_size())
        self.num_total_experts = num_experts
        self.top_k = top_k
        self.hidden_size = hidden_size
        # From here on ``intermediate_size`` is the per-rank share.
        self.intermediate_size = intermediate_size // self.tp_size
        self.params_dtype = (torch.get_default_dtype()
                             if params_dtype is None else params_dtype)

        # The router is replicated on every rank and never quantized.
        self.gate = ReplicatedLinear(
            self.hidden_size,
            self.num_total_experts,
            bias=False,
            params_dtype=self.params_dtype,
            quant_config=None,
        )

        device = current_platform.device_type
        # ``ws`` stacks the w1 rows followed by the w3 rows of every expert
        # (see ``weight_loader``); ``w2s`` stacks the w2 matrices.
        merged_in_shape = (self.num_total_experts,
                           2 * self.intermediate_size, self.hidden_size)
        out_shape = (self.num_total_experts, self.hidden_size,
                     self.intermediate_size)
        self.ws = nn.Parameter(
            torch.empty(merged_in_shape,
                        device=device,
                        dtype=self.params_dtype))
        self.w2s = nn.Parameter(
            torch.empty(out_shape, device=device, dtype=self.params_dtype))

        for expert_weights in (self.ws, self.w2s):
            set_weight_attrs(expert_weights,
                             {"weight_loader": self.weight_loader})

    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
                      weight_name: str, expert_id: int):
        """Copy this rank's slice of one expert matrix into ``param``.

        Args:
            param: The stacked parameter (``ws`` or ``w2s``) to fill.
            loaded_weight: The full checkpoint tensor for a single expert.
            weight_name: Checkpoint name; its suffix (``w1.weight``,
                ``w3.weight`` or ``w2.weight``) selects the destination.
            expert_id: Index of the expert inside the stacked parameter.
        """
        rank = get_tensor_model_parallel_rank()
        width = self.intermediate_size
        begin = rank * width
        end = begin + width
        target = param.data

        if weight_name.endswith("w1.weight"):
            # w1 occupies the first ``width`` rows of the merged tensor.
            target[expert_id, :width, :] = loaded_weight[begin:end, :]
        elif weight_name.endswith("w3.weight"):
            # w3 occupies the following ``width`` rows.
            target[expert_id, width:2 * width, :] = loaded_weight[begin:end, :]
        elif weight_name.endswith("w2.weight"):
            # w2 is sharded along its second dimension instead.
            target[expert_id, :, :] = loaded_weight[:, begin:end]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Route the tokens through the experts and return the mixed output.

        The result is reshaped to the two-dimensional shape of the input.
        """
        num_tokens, hidden_size = hidden_states.shape
        flat_states = hidden_states.view(-1, self.hidden_size)
        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(flat_states)
        mixed = fused_moe(
            flat_states,
            self.ws,
            self.w2s,
            router_logits,
            self.top_k,
            renormalize=True,
            inplace=True,
        )

        # Each rank only holds a partial sum over its intermediate slice.
        if self.tp_size > 1:
            mixed = tensor_model_parallel_all_reduce(mixed)

        return mixed.view(num_tokens, hidden_size)
class FM9GMLP(nn.Module):
    """Dense gated feed-forward block: merged gate/up projection, a gated
    activation, then a down projection."""

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
        hidden_act_param: float,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        """Build the projections and select the activation.

        Args:
            hidden_size: Width of the block's input and output.
            intermediate_size: Width of each of the two merged projections.
            hidden_act: Either ``"silu"`` or ``"fatrelu"``.
            hidden_act_param: Threshold handed to ``FatreluAndMul``; unused
                when ``hidden_act`` is ``"silu"``.
            quant_config: Quantization settings passed to both projections.

        Raises:
            ValueError: If ``hidden_act`` is not a supported activation.
        """
        super().__init__()
        self.gate_up_proj = MergedColumnParallelLinear(
            hidden_size,
            [intermediate_size] * 2,
            bias=False,
            quant_config=quant_config,
        )
        self.down_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=False,
            quant_config=quant_config,
        )

        if hidden_act not in ("silu", "fatrelu"):
            raise ValueError(f"Unsupported activation: {hidden_act}. "
                             "Only silu and fatrelu are supported for now.")
        if hidden_act == "silu":
            self.act_fn = SiluAndMul()
        else:
            self.act_fn = FatreluAndMul(threshold=hidden_act_param)

    def forward(self, x):
        """Apply gate/up projection, activation and down projection to ``x``."""
        merged, _ = self.gate_up_proj(x)
        gated = self.act_fn(merged)
        projected, _ = self.down_proj(gated)
        return projected
class FM9GAttention(nn.Module):
    """Tensor-parallel self-attention with rotary position embeddings.

    The number of key/value heads may differ from the number of query
    heads; the rotary embedding is applied to float32 copies of q and k.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        rope_theta: float = 10000,
        rope_scaling: Optional[Dict[str, Any]] = None,
        max_position_embeddings: int = 8192,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        """Set up the projections, rotary embedding and attention backend.

        Args:
            hidden_size: Model width.
            num_heads: Total number of query heads across all ranks.
            num_kv_heads: Total number of key/value heads across all ranks.
            rope_theta: Base passed to the rotary embedding.
            rope_scaling: Optional rotary scaling configuration.
            max_position_embeddings: Maximum position for the rotary cache.
            cache_config: KV-cache settings forwarded to ``Attention``.
            quant_config: Quantization settings for projections/attention.
            prefix: Module-name prefix; ``".attn"`` is appended for the
                attention backend.
        """
        super().__init__()
        self.hidden_size = hidden_size
        world_size = get_tensor_model_parallel_world_size()

        # Query heads must split evenly over the tensor-parallel ranks.
        self.total_num_heads = num_heads
        assert self.total_num_heads % world_size == 0
        self.num_heads = self.total_num_heads // world_size

        self.total_num_kv_heads = num_kv_heads
        if self.total_num_kv_heads >= world_size:
            # At least as many KV heads as ranks: partition the KV heads
            # across the tensor-parallel ranks.
            assert self.total_num_kv_heads % world_size == 0
        else:
            # Fewer KV heads than ranks: KV heads are replicated across
            # the tensor-parallel ranks.
            assert world_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // world_size)

        # Per-rank sizes derived from the head layout.
        self.head_dim = hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings

        self.qkv_proj = QKVParallelLinear(
            hidden_size,
            self.head_dim,
            self.total_num_heads,
            self.total_num_kv_heads,
            bias=False,
            quant_config=quant_config,
        )
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=False,
            quant_config=quant_config,
        )

        rotary = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=max_position_embeddings,
            base=rope_theta,
            rope_scaling=rope_scaling,
        )
        self.rotary_emb = rotary
        # Recompute the cos/sin cache so rope runs in fp32 instead of bf16.
        self.rotary_emb.cos_sin_cache = (
            self.rotary_emb._compute_cos_sin_cache())

        self.attn = Attention(
            self.num_heads,
            self.head_dim,
            self.scaling,
            num_kv_heads=self.num_kv_heads,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
        )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Run attention over ``hidden_states`` and return the projected output."""
        fused_qkv, _ = self.qkv_proj(hidden_states)
        query, key, value = fused_qkv.split(
            [self.q_size, self.kv_size, self.kv_size], dim=-1)

        # Rotate in float32, then restore the projection dtype.
        compute_dtype = query.dtype
        query = query.float()
        key = key.float()
        query, key = self.rotary_emb(positions, query, key)
        query = query.to(compute_dtype)
        key = key.to(compute_dtype)

        context = self.attn(query, key, value)
        projected, _ = self.o_proj(context)
        return projected
class FM9GDecoderLayer(nn.Module):
    """One FM9G transformer block: pre-norm attention followed by a pre-norm
    feed-forward block (dense MLP or MoE), each added to its residual after
    scaling by ``scale_depth / sqrt(num_hidden_layers)``."""

    def __init__(
        self,
        config: FM9GConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        """Store the configuration and build the two sub-blocks.

        Args:
            config: Model configuration.
            cache_config: KV-cache settings forwarded to the attention block.
            quant_config: Quantization settings forwarded to the sub-blocks.
            prefix: Module-name prefix of this layer.
        """
        super().__init__()
        self.config = config
        self.cache_config = cache_config
        self.quant_config = quant_config
        self.hidden_size = config.hidden_size
        # Optional config entries and the fallback used when they are absent.
        optional_settings = (
            ("rope_theta", 10000),
            ("rope_scaling", None),
            ("max_position_embeddings", 8192),
        )
        for attr_name, fallback in optional_settings:
            setattr(self, attr_name, getattr(config, attr_name, fallback))
        self.prefix = prefix
        self._init_attn_block()
        self._init_ffn_block()

    def _init_attn_block(self):
        """Create the input layer norm and the self-attention module."""
        cfg = self.config
        self.input_layernorm = RMSNorm(cfg.hidden_size, eps=cfg.rms_norm_eps)
        self.self_attn = FM9GAttention(
            hidden_size=self.hidden_size,
            num_heads=cfg.num_attention_heads,
            num_kv_heads=cfg.num_key_value_heads,
            rope_theta=self.rope_theta,
            rope_scaling=self.rope_scaling,
            max_position_embeddings=self.max_position_embeddings,
            cache_config=self.cache_config,
            quant_config=self.quant_config,
            prefix=f"{self.prefix}.self_attn",
        )

    def _init_ffn_block(self):
        """Create the post-attention layer norm and the feed-forward module.

        A config without ``num_experts`` (or with a value of 0) gets the
        dense ``FM9GMLP``; otherwise ``FM9GMoE`` is used.
        """
        cfg = self.config
        self.post_attention_layernorm = RMSNorm(cfg.hidden_size,
                                                eps=cfg.rms_norm_eps)
        self.num_experts = getattr(cfg, "num_experts", 0)
        if self.num_experts != 0:
            self.mlp = FM9GMoE(
                num_experts=cfg.num_experts,
                top_k=cfg.num_experts_per_tok,
                hidden_size=cfg.hidden_size,
                intermediate_size=cfg.intermediate_size,
            )
            return
        self.mlp = FM9GMLP(
            hidden_size=self.hidden_size,
            intermediate_size=cfg.intermediate_size,
            hidden_act=cfg.hidden_act,
            hidden_act_param=getattr(cfg, "hidden_act_param", 0.),
            quant_config=self.quant_config,
        )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply the attention and feed-forward sub-blocks.

        The incoming ``residual`` argument is not read: the residual is
        taken from ``hidden_states`` itself. The second element of the
        returned tuple is always ``None``.
        """
        # Both residual additions use the same depth-dependent scale.
        branch_scale = (self.config.scale_depth /
                        math.sqrt(self.config.num_hidden_layers))

        # Self Attention
        shortcut = hidden_states
        attn_out = self.self_attn(
            positions=positions,
            hidden_states=self.input_layernorm(hidden_states),
        )
        hidden_states = shortcut + attn_out * branch_scale

        # Fully Connected
        shortcut = hidden_states
        ffn_out = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = shortcut + ffn_out * branch_scale

        return hidden_states, None
@support_torch_compile
class FM9GModel(nn.Module):
    """FM9G decoder stack: scaled token embedding, the decoder layers built
    by ``make_layers`` and a final RMSNorm.

    ``forward`` consults ``get_pp_group()`` to decide whether this rank
    embeds the tokens itself or reads intermediate tensors, and whether it
    applies the final norm or hands intermediate tensors onward.
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the embedding, decoder layers and final norm.

        Args:
            vllm_config: Source of the HF model config and of the cache,
                quantization and LoRA settings.
            prefix: Module-name prefix; ``".layers"`` is appended for the
                decoder layers.
        """
        super().__init__()
        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config
        lora_config = vllm_config.lora_config

        self.config = config
        self.cache_config = cache_config
        self.quant_config = quant_config
        # Extra embedding rows reserved for LoRA: the per-adapter extra
        # vocabulary times the adapter count (1 if max_loras is falsy).
        lora_vocab = (lora_config.lora_extra_vocab_size *
                      (lora_config.max_loras or 1)) if lora_config else 0
        self.vocab_size = config.vocab_size + lora_vocab
        self.org_vocab_size = config.vocab_size
        self.embed_tokens = VocabParallelEmbedding(
            self.vocab_size,
            config.hidden_size,
            org_num_embeddings=config.vocab_size,
        )
        # 0 when the config has no ``num_experts``; load_weights uses this
        # to build the expert weight-name mapping.
        self.num_experts = getattr(self.config, "num_experts", 0)
        self._init_layers(prefix, config, cache_config, quant_config)
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(
                ["hidden_states", "residual"], self.config.hidden_size))

    def _init_layers(
        self,
        prefix: str,
        config: FM9GConfig,
        cache_config: Optional[CacheConfig],
        quant_config: Optional[QuantizationConfig],
    ):
        """Create the decoder layers and record the layer range
        (``start_layer`` / ``end_layer``) returned by ``make_layers``."""
        self.start_layer, self.end_layer, self.layers = make_layers(
            config.num_hidden_layers,
            lambda prefix: FM9GDecoderLayer(
                config, cache_config, quant_config, prefix=prefix),
            prefix=f"{prefix}.layers")

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Embed ``input_ids`` and multiply by ``config.scale_emb``."""
        embedding = self.embed_tokens(input_ids)
        return embedding * self.config.scale_emb

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the layers in ``[start_layer, end_layer)`` on this rank.

        Returns:
            The normed hidden states on the last pipeline rank, otherwise
            an ``IntermediateTensors`` holding ``hidden_states`` and
            ``residual``.
        """
        if get_pp_group().is_first_rank:
            # Precomputed embeddings take precedence over input_ids. Note
            # that they are used as given, without the scale_emb factor.
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                hidden_states = self.get_input_embeddings(input_ids)
            residual = None
        else:
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]

        for layer in self.layers[self.start_layer:self.end_layer]:
            hidden_states, residual = layer(
                positions,
                hidden_states,
                residual,
            )
        if not get_pp_group().is_last_rank:
            return IntermediateTensors({
                "hidden_states": hidden_states,
                "residual": residual
            })
        hidden_states = self.norm(hidden_states)
        return hidden_states

    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        """Load checkpoint tensors into this module's parameters.

        Args:
            weights: ``(checkpoint_name, tensor)`` pairs.

        Returns:
            The (possibly rewritten) parameter names that were processed.
        """
        # Checkpoint projections that are fused into one parameter here.
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        # Per-expert checkpoint weights: w1 and w3 go into "ws", w2 into
        # "w2s". Empty when num_experts is 0.
        expert_params_mapping = [
            # (param_name, weight_name, expert_id)
            ("ws" if weight_name in ["w1", "w3"] else "w2s",
             f"experts.{expert_id}.{weight_name}.weight", expert_id)
            for expert_id in range(self.num_experts)
            for weight_name in ["w1", "w2", "w3"]
        ]
        params_dict = dict(self.named_parameters())
        loaded_params: Set[str] = set()
        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue
            if ("rotary_emb.cos_cached" in name
                    or "rotary_emb.sin_cached" in name):
                # Models trained using ColossalAI may include these tensors in
                # the checkpoint. Skip them.
                continue
            # First try the fused (stacked) parameters. The ``continue``
            # statements inside this loop move on to the next mapping entry,
            # not to the next weight.
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
                # NOTE(review): ``name`` is rewritten before the skip checks
                # below, so later iterations and the ``else`` branch see the
                # rewritten name when one of those checks fires.
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Reached only when no stacked mapping loaded the weight
                # (the loop above ended without ``break``): try the experts.
                for param_name, weight_name, expert_id in expert_params_mapping:
                    if weight_name not in name:
                        continue
                    name = name.replace(weight_name, param_name)
                    if is_pp_missing_parameter(name, self):
                        continue
                    param = params_dict[name]
                    weight_loader = param.weight_loader
                    weight_loader(param,
                                  loaded_weight,
                                  weight_name,
                                  expert_id=expert_id)
                    break
                else:
                    # Plain parameter. The ``continue`` statements here
                    # apply to the outer loop over ``weights``, so a skipped
                    # weight is not added to ``loaded_params``.
                    # Skip loading extra bias for GPTQ models.
                    if name.endswith(".bias") and name not in params_dict:
                        continue
                    if is_pp_missing_parameter(name, self):
                        continue
                    param = params_dict[name]
                    weight_loader = getattr(param, "weight_loader",
                                            default_weight_loader)
                    weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params
class FM9GForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
    """FM9G decoder with a language-model head for causal generation."""

    # Fused parameters and the checkpoint projections they are built from.
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    }

    # LoRA specific attributes
    embedding_modules = {
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
    embedding_padding_modules = ["lm_head"]

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the decoder, the LM head and the logits processor.

        Args:
            vllm_config: Source of the HF model config and of the cache,
                quantization and LoRA settings.
            prefix: Module-name prefix; the decoder is created under the
                name produced by ``maybe_prefix(prefix, "model")``.
        """
        super().__init__()
        hf_config = vllm_config.model_config.hf_config
        lora_config = vllm_config.lora_config
        quant_config = vllm_config.quant_config

        self.prefix = prefix
        self.vllm_config = vllm_config
        self.config = hf_config
        self.lora_config = lora_config
        self.cache_config = vllm_config.cache_config
        self.quant_config = quant_config

        self.model = self._init_model(vllm_config=vllm_config,
                                      prefix=maybe_prefix(prefix, "model"))

        # With LoRA the head also covers the adapters' extra vocabulary.
        unpadded_vocab_size = (
            hf_config.vocab_size + lora_config.lora_extra_vocab_size
            if lora_config else hf_config.vocab_size)
        # We need bigger padding if using lora for kernel compatibility.
        vocab_padding = (lora_config.lora_vocab_padding_size
                         if lora_config else DEFAULT_VOCAB_PADDING_SIZE)
        self.lm_head = ParallelLMHead(
            unpadded_vocab_size,
            hf_config.hidden_size,
            org_num_embeddings=hf_config.vocab_size,
            padding_size=vocab_padding,
            quant_config=quant_config,
        )
        if hf_config.tie_word_embeddings:
            self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)

        # Hidden states are divided by this factor before the LM head.
        self.scale_width = (self.config.hidden_size /
                            self.config.dim_model_base)

        self.logits_processor = LogitsProcessor(unpadded_vocab_size,
                                                hf_config.vocab_size)
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

    def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Create the underlying ``FM9GModel`` decoder."""
        return FM9GModel(vllm_config=vllm_config, prefix=prefix)

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Delegate to the decoder's ``get_input_embeddings``."""
        return self.model.get_input_embeddings(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the decoder and return whatever it returns."""
        return self.model(input_ids, positions, intermediate_tensors,
                          inputs_embeds)

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        """Divide the hidden states by ``scale_width`` and project them
        through the LM head via the logits processor."""
        scaled_states = hidden_states / self.scale_width
        return self.logits_processor(self.lm_head, scaled_states,
                                     sampling_metadata)

    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        """Load checkpoint weights through ``AutoWeightsLoader``.

        ``lm_head.`` entries are skipped when the word embeddings are tied.
        """
        skipped = ["lm_head."] if self.config.tie_word_embeddings else None
        weights_loader = AutoWeightsLoader(
            self,
            skip_prefixes=skipped,
        )
        return weights_loader.load_weights(weights)
\ No newline at end of file
vllm/model_executor/models/registry.py
View file @
ac4f685b
...
...
@@ -95,7 +95,6 @@ _TEXT_GENERATION_MODELS = {
"Ernie4_5_MoeForCausalLM"
:
(
"ernie45_moe"
,
"Ernie4_5_MoeForCausalLM"
),
"ExaoneForCausalLM"
:
(
"exaone"
,
"ExaoneForCausalLM"
),
"Exaone4ForCausalLM"
:
(
"exaone4"
,
"Exaone4ForCausalLM"
),
"FM9GForCausalLM"
:
(
"fm9g"
,
"FM9GForCausalLM"
),
"Fairseq2LlamaForCausalLM"
:
(
"fairseq2_llama"
,
"Fairseq2LlamaForCausalLM"
),
"FalconForCausalLM"
:
(
"falcon"
,
"FalconForCausalLM"
),
"FalconMambaForCausalLM"
:
(
"mamba"
,
"MambaForCausalLM"
),
...
...
vllm/tokenizers/detokenizer_utils.py
View file @
ac4f685b
...
...
@@ -16,7 +16,6 @@ def _convert_tokens_to_string_with_added_encoders(
output_tokens
:
list
[
str
],
skip_special_tokens
:
bool
,
spaces_between_special_tokens
:
bool
,
mode
:
str
,
)
->
str
:
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
...
...
@@ -30,12 +29,9 @@ def _convert_tokens_to_string_with_added_encoders(
current_sub_text
:
list
[
str
]
=
[]
convert_tokens_to_string
=
tokenizer
.
convert_tokens_to_string
added_vocab_set
=
set
(
tokenizer
.
get_added_vocab
())
if
mode
!=
"cpm"
:
all_special_tokens
=
(
set
(
tokenizer
.
all_special_tokens
)
if
skip_special_tokens
else
()
)
else
:
all_special_tokens
=
tokenizer
.
_special_token_set
all_special_tokens
=
(
set
(
tokenizer
.
all_special_tokens
)
if
skip_special_tokens
else
()
)
for
token
in
output_tokens
:
# Use precomputed set for skip-special check
...
...
@@ -49,10 +45,7 @@ def _convert_tokens_to_string_with_added_encoders(
else
:
current_sub_text
.
append
(
token
)
if
current_sub_text
:
if
mode
!=
"cpm"
:
sub_texts
.
append
(
convert_tokens_to_string
(
current_sub_text
))
else
:
sub_texts
=
tokenizer
.
decode
(
current_sub_text
)
sub_texts
.
append
(
convert_tokens_to_string
(
current_sub_text
))
if
spaces_between_special_tokens
:
return
" "
.
join
(
sub_texts
)
return
""
.
join
(
sub_texts
)
...
...
@@ -122,7 +115,6 @@ def detokenize_incrementally(
read_offset
:
int
,
skip_special_tokens
:
bool
=
False
,
spaces_between_special_tokens
:
bool
=
True
,
mode
:
str
=
"cpm"
,
)
->
tuple
[
list
[
str
],
str
,
int
,
int
]:
"""Detokenizes the input ids incrementally and returns the new tokens
and the new text.
...
...
@@ -158,11 +150,7 @@ def detokenize_incrementally(
assert
prev_tokens
is
not
None
# If the new token id is out of bounds, return an empty string.
if
mode
==
"cpm"
:
vocab_size
=
tokenizer
.
vocab_size
else
:
vocab_size
=
len
(
tokenizer
)
if
0
<=
new_token_id
<
vocab_size
:
if
0
<=
new_token_id
<
len
(
tokenizer
):
# Put new_token_id in a list so skip_special_tokens is respected
new_tokens
=
tokenizer
.
convert_ids_to_tokens
(
[
new_token_id
],
skip_special_tokens
=
skip_special_tokens
...
...
@@ -191,14 +179,12 @@ def detokenize_incrementally(
output_tokens
[
prefix_offset
:
read_offset
],
skip_special_tokens
=
skip_special_tokens
,
spaces_between_special_tokens
=
spaces_between_special_tokens
,
mode
=
mode
,
)
new_text
=
_convert_tokens_to_string_with_added_encoders
(
tokenizer
,
output_tokens
[
prefix_offset
:],
skip_special_tokens
=
skip_special_tokens
,
spaces_between_special_tokens
=
spaces_between_special_tokens
,
mode
=
mode
,
)
if
len
(
new_text
)
<=
len
(
prefix_text
)
or
new_text
.
endswith
(
"�"
):
...
...
vllm/transformers_utils/configs/__init__.py
View file @
ac4f685b
...
...
@@ -26,7 +26,6 @@ _CLASS_TO_MODULE: dict[str, str] = {
"HunYuanVLConfig"
:
"vllm.transformers_utils.configs.hunyuan_vl"
,
"HunYuanVLTextConfig"
:
"vllm.transformers_utils.configs.hunyuan_vl"
,
"HunYuanVLVisionConfig"
:
"vllm.transformers_utils.configs.hunyuan_vl"
,
"FM9GConfig"
:
"vllm.transformers_utils.configs.fm9g"
,
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
...
...
@@ -63,7 +62,6 @@ __all__ = [
"DeepseekV3Config"
,
"DotsOCRConfig"
,
"EAGLEConfig"
,
"FM9GConfig"
,
"FlexOlmoConfig"
,
"HunYuanVLConfig"
,
"HunYuanVLTextConfig"
,
...
...
vllm/transformers_utils/configs/fm9g.py
deleted
100644 → 0
View file @
05e8b083
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""FM9G model configuration"""
from
transformers.configuration_utils
import
PretrainedConfig
from
transformers.utils
import
logging
# Module-level logger obtained from the `transformers.utils.logging` helpers.
logger = logging.get_logger(__name__)

# Archive map for FM9G configurations; it is defined empty in this module.
FM9G_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class FM9GConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`FM9GModel`]. It is used to instantiate an FM9G
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the FM9G-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the FM9G model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`FM9GModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        scale_emb (*optional*, defaults to 1):
            Stored on the configuration unchanged; its use is defined by the model code, not by this class.
        dim_model_base (*optional*, defaults to 1):
            Stored on the configuration unchanged; its use is defined by the model code, not by this class.
        scale_depth (*optional*, defaults to 1):
            Stored on the configuration unchanged; its use is defined by the model code, not by this class.

    Raises:
        ValueError: If `rope_scaling` is given but is not a two-entry dict with a supported `type` and a float
            `factor` greater than 1.
    """

    model_type = "fm9g"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        scale_emb=1,
        dim_model_base=1,
        scale_depth=1,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # Validate as soon as `rope_scaling` is stored so a bad value fails
        # before the base-class initialisation runs.
        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.scale_emb = scale_emb
        self.dim_model_base = dim_model_base
        self.scale_depth = scale_depth

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        # Best-effort probe: select flash-attention only when the package can
        # be imported. `except Exception` keeps the probe tolerant of any
        # import-time failure while no longer swallowing KeyboardInterrupt or
        # SystemExit the way a bare `except:` does.
        try:
            import flash_attn  # noqa: F401  (imported only to test availability)

            self._attn_implementation = "flash_attention_2"
        except Exception:
            pass

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Does nothing when `rope_scaling` is `None`; otherwise raises
        `ValueError` unless it is a dict with exactly two entries, a `type` of
        `"linear"` or `"dynamic"`, and a float `factor` greater than 1.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
                f"got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
\ No newline at end of file
vllm/transformers_utils/tokenizers/__init__.py
deleted
100644 → 0
View file @
05e8b083
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm.transformers_utils.tokenizers.cpm_9g
import
CPM9GTokenizer
__all__
=
[
"CPM9GTokenizer"
]
vllm/transformers_utils/tokenizers/cpm_9g.py
deleted
100644 → 0
View file @
05e8b083
import
io
import
json
import
os
from
shutil
import
copyfile
from
typing
import
Any
,
Dict
,
IO
,
List
,
Optional
,
Tuple
# import pkg_resources
import
sentencepiece
as
spm
from
pytrie
import
StringTrie
from
transformers.tokenization_utils
import
AddedToken
,
PreTrainedTokenizer
from
transformers.utils
import
logging
# Module-level logger obtained from the `transformers.utils.logging` helpers.
logger = logging.get_logger(__name__)

# File name of the vocabulary; CPM9GTokenizer uses it both to locate the
# vocabulary inside a directory and to name the file written by
# `save_vocabulary`.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# The two tables below are defined empty and are only assigned to class
# attributes of CPM9GTokenizer.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {},
    "tokenizer_file": {},
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
class CPM9GTokenizer(PreTrainedTokenizer):
    """Longest-match tokenizer for CPM-9G with a UTF-8 byte fallback.

    The vocabulary is a text file holding one JSON-encoded token per line
    (see `load_vocab`). Loaded tokens are split into two tables:

    * ``encoder`` / ``decoder``: ordinary text pieces and their ids.
    * ``_special_encoder``: the ``unk``/``bos``/``eos`` tokens and the 256
      byte tokens ``<0x00>`` .. ``<0xFF>``. ``_byte_decoder`` maps the id of
      each byte token back to its byte value.

    `encode` splits text greedily into the longest known pieces and encodes
    any piece missing from ``encoder`` as the byte tokens of its UTF-8 form.

    Args:
        vocab_file: Path of the vocabulary file. When the path does not
            contain ``"vocab.txt"`` it is treated as a directory that holds
            ``vocab.txt``.
        unk_token: Text of the unknown token.
        bos_token: Text of the beginning-of-sequence token.
        eos_token: Text of the end-of-sequence token.
        pad_token: Text of the padding token, if any.
        sp_model_kwargs: Stored and forwarded to the base class; not used
            otherwise by this class.
        add_bos_token: Whether `build_inputs_with_special_tokens` prepends
            the BOS id.
        add_eos_token: Whether `build_inputs_with_special_tokens` appends
            the EOS id.
        clean_up_tokenization_spaces: Forwarded to the base class.

    Raises:
        ValueError: If `vocab_file` is missing or empty.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token: str = "<unk>",
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        pad_token: Optional[str] = None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token: bool = True,
        add_eos_token: bool = False,
        clean_up_tokenization_spaces: bool = False,
        **kwargs,
    ):
        self.sp_model_kwargs = sp_model_kwargs or {}
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.unk_token = unk_token
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.pad_token = pad_token

        # "<0x00>" .. "<0xFF>": one token per possible byte value.
        self.byte_list: List[str] = [f"<0x{i:02X}>" for i in range(0x100)]

        self._special_token_set = {self.unk_token, self.bos_token, self.eos_token, *self.byte_list}

        if not vocab_file:
            # Without a vocabulary nothing below can work; fail with a clear
            # message instead of an unbound-variable error further down.
            raise ValueError(
                "CPM9GTokenizer requires `vocab_file`: a vocab.txt path or a directory containing vocab.txt"
            )
        if VOCAB_FILES_NAMES["vocab_file"] not in vocab_file:
            vocab_path = os.path.join(vocab_file, VOCAB_FILES_NAMES["vocab_file"])
        else:
            # The path already names the vocabulary file: open that file, not
            # a bare "vocab.txt" resolved against the working directory.
            vocab_path = vocab_file
        with io.FileIO(vocab_path, "rb") as vocab_fp:
            all_tokens = self.load_vocab(vocab_fp)

        self.encoder: Dict[str, int] = {}
        self._special_encoder: Dict[str, int] = {}
        for token, token_id in all_tokens.items():
            if token in self._special_token_set:
                self._special_encoder[token] = token_id
            else:
                self.encoder[token] = token_id

        self.decoder = {v: k for k, v in self.encoder.items()}
        # Raises KeyError if the vocabulary lacks any of the 256 byte tokens.
        self._byte_decoder = {self._special_encoder[token]: i for i, token in enumerate(self.byte_list)}

        self._max_word_len = max((len(piece) for piece in self.encoder), default=0)

        # For every first character, the length of the longest piece that
        # starts with it; bounds the search window in `get_piece`.
        self._len_word_first = {}
        for piece in self.encoder:
            if not piece:
                continue
            head = piece[0]
            if len(piece) > self._len_word_first.get(head, 0):
                self._len_word_first[head] = len(piece)

        self.tencoder = StringTrie(self.encoder)
        self._max_token_id = self.vocab_size - 1

        super().__init__(
            bos_token=AddedToken(bos_token, lstrip=False, rstrip=False),
            eos_token=AddedToken(eos_token, lstrip=False, rstrip=False),
            unk_token=AddedToken(unk_token, lstrip=False, rstrip=False),
            pad_token=AddedToken(pad_token, lstrip=False, rstrip=False) if pad_token else None,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def __getstate__(self) -> Dict[str, Any]:
        """Return a copy of the instance dict for pickling."""
        state = self.__dict__.copy()
        # No `sp_model` attribute is created by this class; the key is kept
        # only so the pickled state has the same shape as before.
        state["sp_model"] = None
        return state

    def __setstate__(self, d: Dict[str, Any]) -> None:
        """Restore the instance dict from a pickled state."""
        self.__dict__ = d

    def load_vocab(self, fp: IO[bytes]) -> Dict[str, int]:
        """
        Load a vocabulary file into a dictionary.

        Each non-blank line is parsed as a JSON value and used as the token;
        its id is the number of distinct tokens stored before it.

        Args:
            fp (IO[bytes]): Binary file object of the vocabulary file.

        Returns:
            Dict[str, int]: Mapping from token to id.
        """
        vocab: Dict[str, int] = {}
        reader = io.TextIOWrapper(fp, encoding="utf-8")
        for line in reader:
            line = line.strip()
            if not line:
                continue
            token = json.loads(line)
            vocab[token] = len(vocab)
        return vocab

    @property
    def vocab_size(self) -> int:
        """Return the vocabulary size (text pieces plus special/byte tokens)."""
        return len(self.encoder) + len(self._special_encoder)

    @property
    def max_token_id(self) -> int:
        """Return ``vocab_size - 1`` as computed at construction time."""
        return self._max_token_id

    @property
    def eos_id(self):
        """Id of the EOS token in the special-token table."""
        return self._special_encoder[self.eos_token]

    @property
    def bos_id(self):
        """Id of the BOS token in the special-token table."""
        return self._special_encoder[self.bos_token]

    @property
    def unk_id(self):
        """Id of the unknown token in the special-token table."""
        return self._special_encoder[self.unk_token]

    def get_vocab(self) -> Dict[str, int]:
        """Return the vocabulary as a dictionary, including added tokens."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` greedily into the longest known pieces."""
        output_tokens: List[str] = []
        # `get_piece` never looks further ahead than the longest vocabulary
        # piece, so hand it a bounded window instead of re-slicing the whole
        # remainder of the text for every piece.
        window = max(self._max_word_len, 1)
        pos = 0
        while pos < len(text):
            piece = self.get_piece(text[pos : pos + window])
            output_tokens.append(piece)
            pos += len(piece)
        return output_tokens

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a token (str) to an id using the text-piece table.

        Only ``encoder`` is consulted, so special and byte tokens map to the
        unknown id here.
        """
        return self.encoder.get(token, self.unk_id)

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an index (int) to a token (str) using the text-piece table.

        Only ``decoder`` is consulted, so ids of special and byte tokens map
        to the unknown token here.
        """
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert a sequence of tokens (str) into a single string.

        Ordinary tokens are concatenated. A special token is emitted
        verbatim, preceded by a space when it follows an ordinary token.
        """
        pending: List[str] = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            if token in self._special_token_set:
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.decode(pending) + token
                prev_is_special = True
                pending = []
            else:
                pending.append(token)
                prev_is_special = False
        # Flush the trailing ordinary tokens the same way as inside the loop;
        # this class never creates the `sp_model` previously used here.
        out_string += self.decode(pending)
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary file to a directory.

        The original vocabulary file is copied when it exists as a regular
        file; otherwise the in-memory vocabulary is written out in the format
        read by `load_vocab` (one JSON-encoded token per line, ordered by id).

        Args:
            save_directory (str): Directory to save the vocabulary in.
            filename_prefix (str, optional): Prefix added to the file name.

        Returns:
            Tuple[str]: Path of the saved file.

        Raises:
            ValueError: If ``save_directory`` is not a directory.
        """
        if not os.path.isdir(save_directory):
            raise ValueError(f"Vocabulary path ({save_directory}) should be a directory")

        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # No source file to copy (e.g. `vocab_file` is a directory):
            # serialise the loaded tables instead.
            id_to_token = dict(self.decoder)
            id_to_token.update({token_id: token for token, token_id in self._special_encoder.items()})
            with open(out_vocab_file, "w", encoding="utf-8") as fo:
                fo.writelines(
                    json.dumps(id_to_token[token_id], ensure_ascii=False) + "\n"
                    for token_id in sorted(id_to_token)
                )

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Wrap one or two id sequences with BOS/EOS ids as configured."""
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
        Build the special-token mask for one or two id sequences.

        Args:
            token_ids_0 (List[int]): List of ids.
            token_ids_1 (List[int], optional): Second list of ids for
                sequence pairs.
            already_has_special_tokens (bool, optional, defaults to False):
                Whether the id lists already contain the special tokens; if
                so the base-class implementation is used.

        Returns:
            List[int]: A list of 0/1 integers: 1 for a special token, 0 for a
            sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        first = bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        if token_ids_1 is None:
            return first
        return first + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create token type ids for one or two sequences.

        Args:
            token_ids_0 (List[int]): List of ids.
            token_ids_1 (List[int], optional): Second list of ids for
                sequence pairs.

        Returns:
            List[int]: 0 for every position of the first sequence (with its
            BOS/EOS), 1 for every position of the second.
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output

    def get_piece(self, text: str) -> str:
        """
        Return the longest vocabulary piece that prefixes ``text``.

        Args:
            text (str): Non-empty input text.

        Returns:
            str: The longest matching piece, or the first character of
            ``text`` when no piece matches.
        """
        first_char = text[0]
        longest = self._len_word_first.get(first_char)
        if longest is not None:
            candidate = text[:longest]
            # Try the longest candidate first and shrink one char at a time.
            for end in range(len(candidate), 0, -1):
                if candidate[:end] in self.encoder:
                    return candidate[:end]
        return first_char

    def encode(self, text: str) -> List[int]:
        """
        Encode text into a list of ids.

        Pieces found in the vocabulary map to their id; any other piece is
        encoded as the byte tokens of its UTF-8 form.

        Args:
            text (str): Input text.

        Returns:
            List[int]: Encoded ids.
        """
        ret: List[int] = []
        for piece in self._tokenize(text):
            if piece in self.encoder:
                ret.append(self.encoder[piece])
            else:
                ret.extend(self._encode_unicode(piece))
        return ret

    def _decode_byte_run(self, tokens: List[Any], start: int) -> Tuple[str, int]:
        """Decode the maximal run of byte-token ids starting at ``start``.

        Returns the decoded text and the index just past the run. Invalid or
        truncated UTF-8 becomes U+FFFD instead of raising.
        """
        raw = bytearray()
        end = start
        while end < len(tokens) and tokens[end] in self._byte_decoder:
            raw.append(self._byte_decoder[tokens[end]])
            end += 1
        return raw.decode("utf-8", errors="replace"), end

    def _decode_sequence(self, tokens: List[Any], keep_text_pieces: bool) -> str:
        """Shared decoder behind `decode` and `decode_all`.

        When ``keep_text_pieces`` is true, entries that are already strings
        are copied to the output unchanged.
        """
        parts: List[str] = []
        pos = 0
        while pos < len(tokens):
            tok = tokens[pos]
            if keep_text_pieces and isinstance(tok, str):
                parts.append(tok)
                pos += 1
            elif tok in self._byte_decoder:
                text, pos = self._decode_byte_run(tokens, pos)
                parts.append(text)
            else:
                if tok in self.decoder:
                    parts.append(self.decoder[tok])
                elif tok == self.eos_id:
                    parts.append(self.eos_token)
                elif tok == self.bos_id:
                    parts.append(self.bos_token)
                else:
                    parts.append(self.unk_token)
                pos += 1
        return "".join(parts)

    def decode_all(self, tokens: List[int]) -> str:
        """Decode ids into a string.

        Ids that are neither text pieces, byte tokens, EOS nor BOS become the
        unknown token.
        """
        return self._decode_sequence(tokens, keep_text_pieces=False)

    def decode(self, tokens: List[int]) -> str:
        """
        Decode a list of ids into a string.

        `convert_tokens_to_string` also calls this with token strings; string
        entries are passed through unchanged.

        Args:
            tokens (List[int]): List of ids (string entries are tolerated).

        Returns:
            str: Decoded string.
        """
        return self._decode_sequence(tokens, keep_text_pieces=True)

    def _encode_unicode(self, token: str) -> List[int]:
        """
        Encode a token as the ids of the byte tokens of its UTF-8 form.

        Args:
            token (str): Token to encode.

        Returns:
            List[int]: One byte-token id per UTF-8 byte.
        """
        return [self._special_encoder[self.byte_list[byte]] for byte in token.encode("utf-8")]

    def next_token(self, text: str) -> Tuple[str, List[int]]:
        """
        Quickly get the next matching token using the prefix trie.

        Args:
            text (str): Input text.

        Returns:
            Tuple[str, List[int]]: The matched token and its ids; when no
            piece matches, the first character and its byte-token ids.
        """
        token, token_id = self.tencoder.longest_prefix_item(text, (None, None))

        if token is None:
            token = text[0]
            token_ids = self._encode_unicode(token)
        else:
            token_ids = [token_id]

        return token, token_ids
\ No newline at end of file
vllm/v1/engine/detokenizer.py
View file @
ac4f685b
...
...
@@ -258,7 +258,6 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
class
SlowIncrementalDetokenizer
(
BaseIncrementalDetokenizer
):
def
__init__
(
self
,
tokenizer
:
TokenizerLike
,
request
:
EngineCoreRequest
,
mode
=
"auto"
):
super
().
__init__
(
request
)
self
.
mode
=
mode
self
.
tokenizer
=
tokenizer
params
=
request
.
sampling_params
...
...
@@ -305,7 +304,6 @@ class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer):
read_offset
=
self
.
read_offset
,
skip_special_tokens
=
self
.
skip_special_tokens
,
spaces_between_special_tokens
=
self
.
spaces_between_special_tokens
,
mode
=
self
.
mode
,
)
self
.
tokens
.
extend
(
new_tokens
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment