Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0640f227
Commit
0640f227
authored
Sep 09, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.0' into v0.6.0-dev
parents
82f1ffdf
32e7db25
Changes
335
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
818 additions
and
112 deletions
+818
-112
vllm/spec_decode/multi_step_worker.py
vllm/spec_decode/multi_step_worker.py
+3
-2
vllm/spec_decode/ngram_worker.py
vllm/spec_decode/ngram_worker.py
+2
-1
vllm/spec_decode/proposer_worker_base.py
vllm/spec_decode/proposer_worker_base.py
+2
-1
vllm/spec_decode/smaller_tp_proposer_worker.py
vllm/spec_decode/smaller_tp_proposer_worker.py
+2
-1
vllm/spec_decode/spec_decode_worker.py
vllm/spec_decode/spec_decode_worker.py
+14
-21
vllm/spec_decode/top1_proposer.py
vllm/spec_decode/top1_proposer.py
+3
-3
vllm/spec_decode/util.py
vllm/spec_decode/util.py
+20
-30
vllm/transformers_utils/config.py
vllm/transformers_utils/config.py
+12
-7
vllm/transformers_utils/configs/__init__.py
vllm/transformers_utils/configs/__init__.py
+2
-0
vllm/transformers_utils/configs/exaone.py
vllm/transformers_utils/configs/exaone.py
+190
-0
vllm/transformers_utils/configs/granite.py
vllm/transformers_utils/configs/granite.py
+199
-0
vllm/transformers_utils/detokenizer.py
vllm/transformers_utils/detokenizer.py
+1
-1
vllm/transformers_utils/tokenizer.py
vllm/transformers_utils/tokenizer.py
+60
-38
vllm/transformers_utils/tokenizers/__init__.py
vllm/transformers_utils/tokenizers/__init__.py
+2
-3
vllm/transformers_utils/tokenizers/mistral.py
vllm/transformers_utils/tokenizers/mistral.py
+175
-0
vllm/transformers_utils/utils.py
vllm/transformers_utils/utils.py
+16
-0
vllm/utils.py
vllm/utils.py
+110
-0
vllm/version.py
vllm/version.py
+1
-1
vllm/worker/cpu_model_runner.py
vllm/worker/cpu_model_runner.py
+2
-2
vllm/worker/enc_dec_model_runner.py
vllm/worker/enc_dec_model_runner.py
+2
-1
No files found.
vllm/spec_decode/multi_step_worker.py
View file @
0640f227
...
@@ -4,8 +4,9 @@ from typing import Dict, List, Set, Tuple
...
@@ -4,8 +4,9 @@ from typing import Dict, List, Set, Tuple
import
torch
import
torch
from
vllm.sequence
import
(
ExecuteModelRequest
,
HiddenStates
,
SamplerOutput
,
from
vllm.model_executor.layers.sampler
import
SamplerOutput
SequenceData
,
SequenceGroupMetadata
)
from
vllm.sequence
import
(
ExecuteModelRequest
,
HiddenStates
,
SequenceData
,
SequenceGroupMetadata
)
from
vllm.spec_decode.draft_model_runner
import
TP1DraftModelRunner
from
vllm.spec_decode.draft_model_runner
import
TP1DraftModelRunner
from
vllm.spec_decode.interfaces
import
(
SpeculativeProposals
,
from
vllm.spec_decode.interfaces
import
(
SpeculativeProposals
,
SpeculativeProposer
)
SpeculativeProposer
)
...
...
vllm/spec_decode/ngram_worker.py
View file @
0640f227
...
@@ -3,7 +3,8 @@ from typing import List, Optional, Set, Tuple
...
@@ -3,7 +3,8 @@ from typing import List, Optional, Set, Tuple
import
torch
import
torch
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.spec_decode.interfaces
import
SpeculativeProposals
from
vllm.spec_decode.interfaces
import
SpeculativeProposals
from
vllm.spec_decode.proposer_worker_base
import
NonLLMProposerWorkerBase
from
vllm.spec_decode.proposer_worker_base
import
NonLLMProposerWorkerBase
from
vllm.spec_decode.top1_proposer
import
Top1Proposer
from
vllm.spec_decode.top1_proposer
import
Top1Proposer
...
...
vllm/spec_decode/proposer_worker_base.py
View file @
0640f227
from
abc
import
ABC
,
abstractmethod
from
abc
import
ABC
,
abstractmethod
from
typing
import
List
,
Optional
,
Set
,
Tuple
from
typing
import
List
,
Optional
,
Set
,
Tuple
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.spec_decode.interfaces
import
SpeculativeProposer
from
vllm.spec_decode.interfaces
import
SpeculativeProposer
from
vllm.worker.worker_base
import
LoraNotSupportedWorkerBase
from
vllm.worker.worker_base
import
LoraNotSupportedWorkerBase
...
...
vllm/spec_decode/smaller_tp_proposer_worker.py
View file @
0640f227
...
@@ -6,7 +6,8 @@ from vllm.distributed.parallel_state import (get_tp_group,
...
@@ -6,7 +6,8 @@ from vllm.distributed.parallel_state import (get_tp_group,
init_model_parallel_group
,
init_model_parallel_group
,
patch_tensor_parallel_group
)
patch_tensor_parallel_group
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.spec_decode.interfaces
import
SpeculativeProposals
from
vllm.spec_decode.interfaces
import
SpeculativeProposals
from
vllm.spec_decode.multi_step_worker
import
MultiStepWorker
from
vllm.spec_decode.multi_step_worker
import
MultiStepWorker
from
vllm.spec_decode.proposer_worker_base
import
ProposerWorkerBase
from
vllm.spec_decode.proposer_worker_base
import
ProposerWorkerBase
...
...
vllm/spec_decode/spec_decode_worker.py
View file @
0640f227
...
@@ -8,12 +8,13 @@ from vllm.config import ParallelConfig, SpeculativeConfig
...
@@ -8,12 +8,13 @@ from vllm.config import ParallelConfig, SpeculativeConfig
from
vllm.distributed.communication_op
import
broadcast_tensor_dict
from
vllm.distributed.communication_op
import
broadcast_tensor_dict
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.rejection_sampler
import
RejectionSampler
from
vllm.model_executor.layers.rejection_sampler
import
RejectionSampler
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.layers.spec_decode_base_sampler
import
(
from
vllm.model_executor.layers.spec_decode_base_sampler
import
(
SpecDecodeBaseSampler
,
SpecDecodeStochasticBaseSampler
)
SpecDecodeBaseSampler
,
SpecDecodeStochasticBaseSampler
)
from
vllm.model_executor.layers.typical_acceptance_sampler
import
(
from
vllm.model_executor.layers.typical_acceptance_sampler
import
(
TypicalAcceptanceSampler
)
TypicalAcceptanceSampler
)
from
vllm.sequence
import
(
CompletionSequenceGroupOutput
,
ExecuteModelRequest
,
from
vllm.sequence
import
(
CompletionSequenceGroupOutput
,
ExecuteModelRequest
,
HiddenStates
,
SamplerOutput
,
SequenceGroupMetadata
,
HiddenStates
,
SequenceGroupMetadata
,
get_all_seq_ids
,
get_all_seq_ids_and_request_ids
)
get_all_seq_ids
,
get_all_seq_ids_and_request_ids
)
from
vllm.spec_decode.batch_expansion
import
BatchExpansionTop1Scorer
from
vllm.spec_decode.batch_expansion
import
BatchExpansionTop1Scorer
from
vllm.spec_decode.draft_model_runner
import
TP1DraftModelRunner
from
vllm.spec_decode.draft_model_runner
import
TP1DraftModelRunner
...
@@ -365,12 +366,13 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
...
@@ -365,12 +366,13 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
# used during the prefill phase.
# used during the prefill phase.
# 2. Auto-disable enabled: The running queue size exceeds
# 2. Auto-disable enabled: The running queue size exceeds
# the specified threshold.
# the specified threshold.
# 3. No request: There are no requests in the batch.
# 3. No request: There are no requests in the batch, or
# none of the requests in the batch have spec decoding enabled.
# In any of these cases, the proposer and scorer workers
# In any of these cases, the proposer and scorer workers
# are called normally.
# are called normally.
no_spec
=
num_lookahead_slots
==
0
or
len
(
no_spec
=
num_lookahead_slots
==
0
or
disable_all_speculation
or
all
(
execute_model_req
.
seq_group_metadata_list
sgm
.
num_speculative_tokens
==
0
)
==
0
or
disable_all_speculation
for
sgm
in
execute_model_req
.
seq_group_metadata_list
)
# Broadcast how many lookahead slots are scheduled for this step, and
# Broadcast how many lookahead slots are scheduled for this step, and
# whether all speculation is disabled, to all non-driver workers.
# whether all speculation is disabled, to all non-driver workers.
...
@@ -415,10 +417,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
...
@@ -415,10 +417,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
self
,
execute_model_req
:
ExecuteModelRequest
)
->
bool
:
self
,
execute_model_req
:
ExecuteModelRequest
)
->
bool
:
# When the batch size is too large, disable speculative decoding
# When the batch size is too large, disable speculative decoding
# to stop trading off throughput for latency.
# to stop trading off throughput for latency.
disable_all_speculation
=
(
execute_model_req
.
running_queue_size
>=
return
(
execute_model_req
.
running_queue_size
>=
self
.
disable_by_batch_size
)
self
.
disable_by_batch_size
)
return
disable_all_speculation
def
_maybe_disable_speculative_tokens
(
def
_maybe_disable_speculative_tokens
(
self
,
disable_all_speculation
:
bool
,
self
,
disable_all_speculation
:
bool
,
...
@@ -621,18 +621,12 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
...
@@ -621,18 +621,12 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
# proposal len. This adds some complexity (splitting the batch into spec
# proposal len. This adds some complexity (splitting the batch into spec
# and non spec sequences) and should be removed in the future. It can be
# and non spec sequences) and should be removed in the future. It can be
# done by supporting per-sequence proposal lens.
# done by supporting per-sequence proposal lens.
_
,
spec_indices
=
split_batch_by_proposal_len
(
(
_
,
spec_indices
),
(
_
,
non_spec_indices
)
=
split_batch_by_proposal_len
(
seq_group_metadata_list
,
seq_group_metadata_list
,
proposal_lens_list
)
proposal_lens_list
,
select_proposal_len_zero
=
False
)
_
,
non_spec_indices
=
split_batch_by_proposal_len
(
seq_group_metadata_list
,
proposal_lens_list
,
select_proposal_len_zero
=
True
)
original_indices
=
spec_indices
+
non_spec_indices
original_indices
=
spec_indices
+
non_spec_indices
# Get probabilities of target model,
ex
cluding bonus token.
# Get probabilities of target model,
in
cluding bonus token
s
.
proposal_verifier_probs
=
proposal_scores
.
probs
[
spec_indices
,
:
-
1
]
proposal_verifier_probs
=
proposal_scores
.
probs
[
spec_indices
]
# Get non-speculative sampled tokens from target model.
# Get non-speculative sampled tokens from target model.
non_spec_token_ids
=
proposal_scores
.
token_ids
[
non_spec_indices
]
non_spec_token_ids
=
proposal_scores
.
token_ids
[
non_spec_indices
]
...
@@ -657,13 +651,12 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
...
@@ -657,13 +651,12 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
}
}
accepted_token_ids
=
self
.
spec_decode_sampler
(
accepted_token_ids
=
self
.
spec_decode_sampler
(
target_probs
=
proposal_verifier_probs
,
target_
with_bonus_
probs
=
proposal_verifier_probs
,
bonus_token_ids
=
bonus_token_ids
,
bonus_token_ids
=
bonus_token_ids
,
draft_probs
=
proposal_probs
,
draft_probs
=
proposal_probs
,
draft_token_ids
=
proposal_token_ids
,
draft_token_ids
=
proposal_token_ids
,
**
sampler_extra_kwargs
,
**
sampler_extra_kwargs
,
)
)
# Append output tokens from non-speculative sequences to
# Append output tokens from non-speculative sequences to
# the accepted token ids tensor.
# the accepted token ids tensor.
non_spec_token_ids
=
non_spec_token_ids
.
expand
(
-
1
,
max_proposal_len
+
non_spec_token_ids
=
non_spec_token_ids
.
expand
(
-
1
,
max_proposal_len
+
...
...
vllm/spec_decode/top1_proposer.py
View file @
0640f227
...
@@ -2,8 +2,8 @@ from typing import List, Optional, Set, Tuple
...
@@ -2,8 +2,8 @@ from typing import List, Optional, Set, Tuple
import
torch
import
torch
from
vllm.
sequence
import
(
ExecuteModelRequest
,
SamplerOutput
,
from
vllm.
model_executor.layers.sampler
import
SamplerOutput
SequenceGroupMetadata
)
from
vllm.sequence
import
ExecuteModelRequest
,
SequenceGroupMetadata
from
vllm.spec_decode.interfaces
import
(
SpeculativeProposals
,
from
vllm.spec_decode.interfaces
import
(
SpeculativeProposals
,
SpeculativeProposer
)
SpeculativeProposer
)
from
vllm.spec_decode.proposer_worker_base
import
ProposerWorkerBase
from
vllm.spec_decode.proposer_worker_base
import
ProposerWorkerBase
...
@@ -138,7 +138,7 @@ class Top1Proposer(SpeculativeProposer):
...
@@ -138,7 +138,7 @@ class Top1Proposer(SpeculativeProposer):
# Currently only proposal lens of 0 or the global batch proposal len
# Currently only proposal lens of 0 or the global batch proposal len
# are supported.
# are supported.
# If max_proposal_len is defined, then we shall no exceed this
# If max_proposal_len is defined, then we shall no
t
exceed this
# quota for nonzero_proposal
# quota for nonzero_proposal
new_k
=
0
new_k
=
0
if
(
self
.
max_proposal_len
is
None
if
(
self
.
max_proposal_len
is
None
...
...
vllm/spec_decode/util.py
View file @
0640f227
import
time
import
time
from
contextlib
import
contextmanager
from
contextlib
import
contextmanager
from
typing
import
Dict
,
List
,
Optional
,
Tuple
from
typing
import
Dict
,
List
,
Optional
,
Sequence
,
Tuple
import
torch
import
torch
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.sequence
import
(
CompletionSequenceGroupOutput
,
Logprob
,
from
vllm.sequence
import
(
CompletionSequenceGroupOutput
,
Logprob
,
SamplerOutput
,
SequenceGroupMetadata
,
SequenceGroupMetadata
,
SequenceOutput
)
SequenceOutput
)
SeqId
=
int
SeqId
=
int
...
@@ -43,8 +43,8 @@ def get_sampled_token_logprobs(
...
@@ -43,8 +43,8 @@ def get_sampled_token_logprobs(
sampled_token_ids
,
]
sampled_token_ids
,
]
expanded_selected_logprobs
=
selected_logprobs
.
unsqueeze
(
-
1
).
expand
(
expanded_selected_logprobs
=
selected_logprobs
.
unsqueeze
(
-
1
).
expand
(
-
1
,
-
1
,
vocab_size
)
-
1
,
-
1
,
vocab_size
)
sampled_token_ids_ranks
=
(
logprob_tensor
>
=
sampled_token_ids_ranks
=
(
logprob_tensor
>
expanded_selected_logprobs
).
sum
(
-
1
)
expanded_selected_logprobs
).
sum
(
-
1
)
.
add_
(
1
)
return
sampled_token_ids_ranks
,
selected_logprobs
return
sampled_token_ids_ranks
,
selected_logprobs
...
@@ -98,33 +98,26 @@ def create_sequence_group_output(
...
@@ -98,33 +98,26 @@ def create_sequence_group_output(
def
split_batch_by_proposal_len
(
def
split_batch_by_proposal_len
(
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
],
seq_group_metadata_list
:
List
[
SequenceGroupMetadata
],
proposal_lens
:
List
[
int
],
select_proposal_len_zero
:
bool
proposal_lens
:
List
[
int
],
)
->
Tuple
[
List
[
SequenceGroupMetadata
],
List
[
int
]]:
)
->
Tuple
[
Tuple
[
List
[
SequenceGroupMetadata
],
List
[
int
]],
Tuple
[
List
[
SequenceGroupMetadata
],
List
[
int
]]]:
"""Utility function that splits a batch based on whether the proposal len is
"""Utility function that splits a batch based on whether the proposal len is
zero or not. We should remove this once vLLM supports per-sequence proposal
zero or not. We should remove this once vLLM supports per-sequence proposal
lens in a batch.
lens in a batch.
"""
"""
if
select_proposal_len_zero
:
nonzero_lists
:
Tuple
[
List
[
SequenceGroupMetadata
],
List
[
int
]]
=
([],
[])
predicate
=
lambda
proposal_len
:
proposal_len
==
0
zero_lists
:
Tuple
[
List
[
SequenceGroupMetadata
],
List
[
int
]]
=
([],
[])
else
:
for
i
,
(
seq_group
,
proposal_len
)
in
enumerate
(
predicate
=
lambda
proposal_len
:
proposal_len
!=
0
zip
(
seq_group_metadata_list
,
proposal_lens
)):
seq_groups
,
indices
=
nonzero_lists
if
proposal_len
else
zero_lists
indices
=
[
seq_groups
.
append
(
seq_group
)
i
for
i
,
(
_
,
proposal_len
indices
.
append
(
i
)
)
in
enumerate
(
zip
(
seq_group_metadata_list
,
proposal_lens
))
return
nonzero_lists
,
zero_lists
if
predicate
(
proposal_len
)
]
seq_groups
=
[
seq_group
for
seq_group
,
proposal_len
in
zip
(
seq_group_metadata_list
,
proposal_lens
)
if
predicate
(
proposal_len
)
]
return
seq_groups
,
indices
def
sampler_output_to_torch
(
def
sampler_output_to_torch
(
sampler_output_list
:
List
[
SamplerOutput
],
sampler_transposed
:
bool
sampler_output_list
:
Sequence
[
SamplerOutput
],
sampler_transposed
:
bool
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
,
Optional
[
torch
.
Tensor
]]:
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
,
Optional
[
torch
.
Tensor
]]:
"""Utility function which converts a list of SamplerOutput to tensors.
"""Utility function which converts a list of SamplerOutput to tensors.
...
@@ -148,18 +141,12 @@ def sampler_output_to_torch(
...
@@ -148,18 +141,12 @@ def sampler_output_to_torch(
dim
=
0
,
dim
=
0
,
)
)
if
sampler_transposed
:
sampled_token_probs
=
sampled_token_probs
.
transpose
(
0
,
1
)
# shape: [batch_size, num_sampler_output, vocab_size]
# shape: [batch_size, num_sampler_output, vocab_size]
sampled_token_logprobs
=
torch
.
stack
(
sampled_token_logprobs
=
torch
.
stack
(
[
sampler_output
.
logprobs
for
sampler_output
in
sampler_output_list
],
[
sampler_output
.
logprobs
for
sampler_output
in
sampler_output_list
],
dim
=
0
,
dim
=
0
,
)
)
if
sampler_transposed
:
sampled_token_logprobs
=
sampled_token_logprobs
.
transpose
(
0
,
1
)
# shape: [batch_size, num_sampler_output]
# shape: [batch_size, num_sampler_output]
sampled_token_ids
=
torch
.
stack
(
sampled_token_ids
=
torch
.
stack
(
[
[
...
@@ -168,7 +155,10 @@ def sampler_output_to_torch(
...
@@ -168,7 +155,10 @@ def sampler_output_to_torch(
],
],
dim
=
0
,
dim
=
0
,
)
)
if
sampler_transposed
:
if
sampler_transposed
:
sampled_token_probs
=
sampled_token_probs
.
transpose
(
0
,
1
)
sampled_token_logprobs
=
sampled_token_logprobs
.
transpose
(
0
,
1
)
sampled_token_ids
=
sampled_token_ids
.
transpose
(
0
,
1
)
sampled_token_ids
=
sampled_token_ids
.
transpose
(
0
,
1
)
if
sampler_output_list
[
0
].
hidden_states
is
not
None
:
if
sampler_output_list
[
0
].
hidden_states
is
not
None
:
...
...
vllm/transformers_utils/config.py
View file @
0640f227
...
@@ -11,11 +11,12 @@ from transformers.models.auto.modeling_auto import (
...
@@ -11,11 +11,12 @@ from transformers.models.auto.modeling_auto import (
from
vllm.envs
import
VLLM_USE_MODELSCOPE
from
vllm.envs
import
VLLM_USE_MODELSCOPE
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.configs
import
(
ChatGLMConfig
,
DbrxConfig
,
from
vllm.transformers_utils.configs
import
(
ChatGLMConfig
,
DbrxConfig
,
EAGLEConfig
,
InternVLChatConfig
,
EAGLEConfig
,
ExaoneConfig
,
JAISConfig
,
MedusaConfig
,
InternVLChatConfig
,
JAISConfig
,
MLPSpeculatorConfig
,
MPTConfig
,
MedusaConfig
,
MLPSpeculatorConfig
,
NemotronConfig
,
RWConfig
,
MPTConfig
,
NemotronConfig
,
UltravoxConfig
)
RWConfig
,
UltravoxConfig
)
from
vllm.transformers_utils.utils
import
check_gguf_file
if
VLLM_USE_MODELSCOPE
:
if
VLLM_USE_MODELSCOPE
:
from
modelscope
import
AutoConfig
from
modelscope
import
AutoConfig
...
@@ -34,6 +35,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
...
@@ -34,6 +35,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
"mlp_speculator"
:
MLPSpeculatorConfig
,
"mlp_speculator"
:
MLPSpeculatorConfig
,
"medusa"
:
MedusaConfig
,
"medusa"
:
MedusaConfig
,
"eagle"
:
EAGLEConfig
,
"eagle"
:
EAGLEConfig
,
"exaone"
:
ExaoneConfig
,
"internvl_chat"
:
InternVLChatConfig
,
"internvl_chat"
:
InternVLChatConfig
,
"nemotron"
:
NemotronConfig
,
"nemotron"
:
NemotronConfig
,
"ultravox"
:
UltravoxConfig
,
"ultravox"
:
UltravoxConfig
,
...
@@ -55,7 +57,7 @@ def get_config(
...
@@ -55,7 +57,7 @@ def get_config(
)
->
PretrainedConfig
:
)
->
PretrainedConfig
:
# Separate model folder from file path for GGUF models
# Separate model folder from file path for GGUF models
is_gguf
=
Path
(
model
).
is_file
()
and
Path
(
model
).
suffix
==
".gguf"
is_gguf
=
check_gguf_file
(
model
)
if
is_gguf
:
if
is_gguf
:
kwargs
[
"gguf_file"
]
=
Path
(
model
).
name
kwargs
[
"gguf_file"
]
=
Path
(
model
).
name
model
=
Path
(
model
).
parent
model
=
Path
(
model
).
parent
...
@@ -107,8 +109,11 @@ def get_hf_image_processor_config(
...
@@ -107,8 +109,11 @@ def get_hf_image_processor_config(
revision
:
Optional
[
str
]
=
None
,
revision
:
Optional
[
str
]
=
None
,
**
kwargs
,
**
kwargs
,
)
->
Dict
[
str
,
Any
]:
)
->
Dict
[
str
,
Any
]:
# ModelScope does not provide an interface for image_processor
if
VLLM_USE_MODELSCOPE
:
return
dict
()
# Separate model folder from file path for GGUF models
# Separate model folder from file path for GGUF models
if
Path
(
model
).
is_file
()
and
Path
(
model
).
suffix
==
".gguf"
:
if
check_gguf_file
(
model
)
:
model
=
Path
(
model
).
parent
model
=
Path
(
model
).
parent
return
get_image_processor_config
(
model
,
revision
=
revision
,
**
kwargs
)
return
get_image_processor_config
(
model
,
revision
=
revision
,
**
kwargs
)
...
...
vllm/transformers_utils/configs/__init__.py
View file @
0640f227
from
vllm.transformers_utils.configs.chatglm
import
ChatGLMConfig
from
vllm.transformers_utils.configs.chatglm
import
ChatGLMConfig
from
vllm.transformers_utils.configs.dbrx
import
DbrxConfig
from
vllm.transformers_utils.configs.dbrx
import
DbrxConfig
from
vllm.transformers_utils.configs.eagle
import
EAGLEConfig
from
vllm.transformers_utils.configs.eagle
import
EAGLEConfig
from
vllm.transformers_utils.configs.exaone
import
ExaoneConfig
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
# `FalconConfig` class from the official HuggingFace transformers library.
...
@@ -22,6 +23,7 @@ __all__ = [
...
@@ -22,6 +23,7 @@ __all__ = [
"JAISConfig"
,
"JAISConfig"
,
"MedusaConfig"
,
"MedusaConfig"
,
"EAGLEConfig"
,
"EAGLEConfig"
,
"ExaoneConfig"
,
"MLPSpeculatorConfig"
,
"MLPSpeculatorConfig"
,
"NemotronConfig"
,
"NemotronConfig"
,
"UltravoxConfig"
,
"UltravoxConfig"
,
...
...
vllm/transformers_utils/configs/exaone.py
0 → 100644
View file @
0640f227
# coding=utf-8
# Copied from
# https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py
# Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Exaone model configuration"""
from
typing
import
Dict
from
transformers.configuration_utils
import
PretrainedConfig
from
transformers.utils
import
logging
logger
=
logging
.
get_logger
(
__name__
)
EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP
:
Dict
[
str
,
str
]
=
{}
class
ExaoneConfig
(
PretrainedConfig
):
r
"""
This is the configuration class to store the configuration of a
:class:`~transformers.ExaoneModel`. It is used to instantiate a GPT Lingvo
model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar
configuration to that of the Exaone model.
Configuration objects inherit from :class:`~transformers.PretrainedConfig`
and can be used to control the model outputs. Read the documentation from
:class:`~transformers.PretrainedConfig` for more information.
Args:
vocab_size (:obj:`int`, `optional`, defaults to 102400):
Vocabulary size of the GPT Lingvo model. Defines the number of
different tokens that can be represented by the :obj:`inputs_ids`
passed when calling :class:`~transformers.ExaoneModel`. Vocabulary
size of the model.
Defines the different tokens that can be represented by the
`inputs_ids` passed to the forward method of
:class:`~transformers.EXAONEModel`.
hidden_size (:obj:`int`, `optional`, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
num_layers (:obj:`int`, `optional`, defaults to 32):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the
Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to
implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi
Head Attention (MHA), if `num_key_value_heads=1` the model will use
Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint,
each group key and value head should be constructed by meanpooling
all the original heads within that group. For more details checkout
[this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
specified, will default to `num_attention_heads`.
rotary_pct (`float`, *optional*, defaults to 0.25):
percentage of hidden dimensions to allocate to rotary embeddings
intermediate_size (:obj:`int`, `optional`, defaults to 8192):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in
the Transformer encoder.
activation_function (:obj:`str` or :obj:`function`, `optional`,
defaults to :obj:`"silu"`):
The non-linear activation function (function or string) in the
encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`,
:obj:`"selu"` and :obj:`"gelu_new"` are supported.
embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
The dropout probability for all fully connected layers in the
embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 2):
The vocabulary size of the :obj:`token_type_ids` passed when calling
:class:`~transformers.EXAONEModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-6):
The epsilon used by the layer normalization layers.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last key/values
attentions (not used by all models).
Only relevant if ``config.is_decoder=True``.
gradient_checkpointing (:obj:`bool`, `optional`,
defaults to :obj:`False`):
If True, use gradient checkpointing to save memory at the expense
of slower backward pass.
Example::
>>> from transformers import ExaoneModel, ExaoneConfig
>>> # Initializing an EXAONE configuration
>>> configuration = ExaoneConfig()
>>> # Initializing a model from configuration
>>> model = ExaoneModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type
=
"exaone"
keys_to_ignore_at_inference
=
[
"past_key_values"
]
attribute_map
=
{
"num_hidden_layers"
:
"num_layers"
}
def
__init__
(
self
,
vocab_size
=
102400
,
max_position_embeddings
=
2048
,
hidden_size
=
2048
,
num_layers
=
32
,
num_attention_heads
=
32
,
num_key_value_heads
=
None
,
intermediate_size
=
None
,
activation_function
=
"silu"
,
rotary_pct
=
0.25
,
resid_dropout
=
0.0
,
embed_dropout
=
0.0
,
attention_dropout
=
0.0
,
layer_norm_epsilon
=
1e-6
,
initializer_range
=
0.02
,
use_cache
=
True
,
bos_token_id
=
0
,
eos_token_id
=
2
,
tie_word_embeddings
=
True
,
**
kwargs
,
):
super
().
__init__
(
bos_token_id
=
bos_token_id
,
eos_token_id
=
eos_token_id
,
tie_word_embeddings
=
tie_word_embeddings
,
**
kwargs
,
)
self
.
vocab_size
=
vocab_size
self
.
max_position_embeddings
=
max_position_embeddings
self
.
hidden_size
=
hidden_size
self
.
num_layers
=
num_layers
self
.
num_attention_heads
=
num_attention_heads
self
.
num_hidden_layers
=
num_layers
if
num_key_value_heads
is
None
:
num_key_value_heads
=
num_attention_heads
self
.
num_key_value_heads
=
num_key_value_heads
if
intermediate_size
:
self
.
intermediate_size
=
intermediate_size
else
:
self
.
intermediate_size
=
hidden_size
*
4
self
.
activation_function
=
activation_function
self
.
resid_dropout
=
resid_dropout
self
.
embed_dropout
=
embed_dropout
self
.
attention_dropout
=
attention_dropout
self
.
layer_norm_epsilon
=
layer_norm_epsilon
self
.
initializer_range
=
initializer_range
self
.
use_cache
=
use_cache
self
.
rotary_pct
=
rotary_pct
self
.
bos_token_id
=
bos_token_id
self
.
eos_token_id
=
eos_token_id
self
.
use_logit_cap
=
kwargs
.
pop
(
"use_logit_cap"
,
False
)
self
.
ln_no_scale
=
kwargs
.
pop
(
"ln_no_scale"
,
False
)
self
.
use_gated
=
kwargs
.
pop
(
"use_gated"
,
False
)
self
.
use_emb_norm
=
kwargs
.
pop
(
"use_emb_norm"
,
False
)
self
.
use_rotary_pos
=
kwargs
.
pop
(
"use_rotary_pos"
,
False
)
self
.
rotary_type
=
kwargs
.
pop
(
"rotary_type"
,
None
)
self
.
scaling_factor
=
kwargs
.
pop
(
"scaling_factor"
,
1
)
self
.
use_absolute_pos
=
kwargs
.
pop
(
"use_absolute_pos"
,
True
)
self
.
use_extra_logit
=
kwargs
.
pop
(
"use_extra_logit"
,
True
)
self
.
rotary_expand_length
=
kwargs
.
pop
(
"rotary_expand_length"
,
None
)
self
.
rotary_base
=
kwargs
.
pop
(
"rotary_base"
,
10000.0
)
self
.
use_qkv_fuse
=
kwargs
.
pop
(
"use_qkv_fuse"
,
False
)
self
.
rescale_before_lm_head
=
kwargs
.
pop
(
"rescale_before_lm_head"
,
(
rotary_pct
==
0.25
))
if
self
.
use_rotary_pos
:
self
.
use_absolute_pos
=
False
vllm/transformers_utils/configs/granite.py
0 → 100644
View file @
0640f227
# coding=utf-8
# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Granite model configuration"""
from
transformers.configuration_utils
import
PretrainedConfig
from
transformers.modeling_rope_utils
import
rope_config_validation
from
transformers.utils
import
logging
logger
=
logging
.
get_logger
(
__name__
)
class
GraniteConfig
(
PretrainedConfig
):
r
"""
This is the configuration class to store the configuration of
a [`GraniteModel`]. It is used to instantiate a Granite
model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar
configuration to that of the Granite-3B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to
control the model outputs. Read the documentation from [`PretrainedConfig`]
for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the Granite model. Defines the number of
different tokens that can be represented by the `inputs_ids`
passed when calling [`GraniteModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the
Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to
implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi
Head Attention (MHA), if `num_key_value_heads=1` the model will use
Multi Query Attention (MQA) otherwise GQA is used. When converting
a multi-head checkpoint to a GQA checkpoint, each group key and
value head should be constructed by meanpooling all the original
heads within that group. For more details checkout
[this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
specified, will default to `num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the
decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values
attentions (not used by all models). Only relevant if
`config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE
embeddings. Currently supports two scaling strategies: linear and
dynamic. Their scaling factor must be a float greater than 1. The
expected format is
`{"type": strategy name, "factor": scaling factor}`.
When using this flag, don't update `max_position_embeddings` to
the expected new maximum. See the following thread for more
information on how these scaling strategies behave:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/.
This is an experimental feature, subject to breaking API changes
in future versions.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output
projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj, down_proj and gate_proj layers
in the MLP layers.
embedding_multiplier (`float`, *optional*, defaults to 1.0):
embedding multiplier
logits_scaling (`float`, *optional*, defaults to 1.0):
divisor for output logits
residual_multiplier (`float`, *optional*, defaults to 1.0):
residual multiplier
attention_multiplier (`float`, *optional*, defaults to 1.0):
attention multiplier
```python
>>> from transformers import GraniteModel, GraniteConfig
>>> # Initializing a Granite granite-3b style configuration
>>> configuration = GraniteConfig()
>>> # Initializing a model from the granite-7b style configuration
>>> model = GraniteModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type
=
"granite"
keys_to_ignore_at_inference
=
[
"past_key_values"
]
def
__init__
(
self
,
vocab_size
=
32000
,
hidden_size
=
4096
,
intermediate_size
=
11008
,
num_hidden_layers
=
32
,
num_attention_heads
=
32
,
num_key_value_heads
=
None
,
hidden_act
=
"silu"
,
max_position_embeddings
=
2048
,
initializer_range
=
0.02
,
rms_norm_eps
=
1e-6
,
use_cache
=
True
,
pad_token_id
=
None
,
bos_token_id
=
1
,
eos_token_id
=
2
,
tie_word_embeddings
=
False
,
rope_theta
=
10000.0
,
rope_scaling
=
None
,
attention_bias
=
False
,
attention_dropout
=
0.0
,
mlp_bias
=
False
,
embedding_multiplier
=
1.0
,
logits_scaling
=
1.0
,
residual_multiplier
=
1.0
,
attention_multiplier
=
1.0
,
**
kwargs
,
):
self
.
vocab_size
=
vocab_size
self
.
max_position_embeddings
=
max_position_embeddings
self
.
hidden_size
=
hidden_size
self
.
intermediate_size
=
intermediate_size
self
.
num_hidden_layers
=
num_hidden_layers
self
.
num_attention_heads
=
num_attention_heads
# for backward compatibility
if
num_key_value_heads
is
None
:
num_key_value_heads
=
num_attention_heads
self
.
num_key_value_heads
=
num_key_value_heads
self
.
hidden_act
=
hidden_act
self
.
initializer_range
=
initializer_range
self
.
rms_norm_eps
=
rms_norm_eps
self
.
use_cache
=
use_cache
self
.
rope_theta
=
rope_theta
self
.
rope_scaling
=
rope_scaling
self
.
attention_bias
=
attention_bias
self
.
attention_dropout
=
attention_dropout
self
.
mlp_bias
=
mlp_bias
self
.
embedding_multiplier
=
embedding_multiplier
self
.
logits_scaling
=
logits_scaling
self
.
residual_multiplier
=
residual_multiplier
self
.
attention_multiplier
=
attention_multiplier
super
().
__init__
(
pad_token_id
=
pad_token_id
,
bos_token_id
=
bos_token_id
,
eos_token_id
=
eos_token_id
,
tie_word_embeddings
=
tie_word_embeddings
,
**
kwargs
,
)
rope_config_validation
(
self
)
vllm/transformers_utils/detokenizer.py
View file @
0640f227
...
@@ -230,7 +230,7 @@ def convert_prompt_ids_to_tokens(
...
@@ -230,7 +230,7 @@ def convert_prompt_ids_to_tokens(
prefix_offset
=
max
(
prefix_offset
=
max
(
read_offset
-
INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET
,
0
)
read_offset
-
INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET
,
0
)
# This is required to guard against out-of-vocab prompt token ids
# This is required to guard against out-of-vocab prompt token ids
_replace_none_with_empty
(
new_tokens
)
_replace_none_with_empty
(
new_tokens
)
# type: ignore[arg-type]
return
new_tokens
,
prefix_offset
,
read_offset
return
new_tokens
,
prefix_offset
,
read_offset
...
...
vllm/transformers_utils/tokenizer.py
View file @
0640f227
import
os
import
os
import
warnings
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
Optional
,
Union
from
typing
import
Optional
,
Union
...
@@ -9,12 +10,15 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer,
...
@@ -9,12 +10,15 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer,
from
vllm.envs
import
VLLM_USE_MODELSCOPE
from
vllm.envs
import
VLLM_USE_MODELSCOPE
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.transformers_utils.tokenizers
import
BaichuanTokenizer
from
vllm.transformers_utils.tokenizers
import
(
BaichuanTokenizer
,
MistralTokenizer
)
from
vllm.transformers_utils.utils
import
check_gguf_file
from
vllm.utils
import
make_async
from
vllm.utils
import
make_async
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
AnyTokenizer
=
Union
[
PreTrainedTokenizer
,
PreTrainedTokenizerFast
]
AnyTokenizer
=
Union
[
PreTrainedTokenizer
,
PreTrainedTokenizerFast
,
MistralTokenizer
]
def
get_cached_tokenizer
(
tokenizer
:
AnyTokenizer
)
->
AnyTokenizer
:
def
get_cached_tokenizer
(
tokenizer
:
AnyTokenizer
)
->
AnyTokenizer
:
...
@@ -93,51 +97,69 @@ def get_tokenizer(
...
@@ -93,51 +97,69 @@ def get_tokenizer(
kwargs
[
"truncation_side"
]
=
"left"
kwargs
[
"truncation_side"
]
=
"left"
# Separate model folder from file path for GGUF models
# Separate model folder from file path for GGUF models
is_gguf
=
Path
(
tokenizer_name
).
is_file
()
and
Path
(
is_gguf
=
check_gguf_file
(
tokenizer_name
)
tokenizer_name
).
suffix
==
".gguf"
if
is_gguf
:
if
is_gguf
:
kwargs
[
"gguf_file"
]
=
Path
(
tokenizer_name
).
name
kwargs
[
"gguf_file"
]
=
Path
(
tokenizer_name
).
name
tokenizer_name
=
Path
(
tokenizer_name
).
parent
tokenizer_name
=
Path
(
tokenizer_name
).
parent
try
:
# if tokenizer is from official mistral org
tokenizer
=
AutoTokenizer
.
from_pretrained
(
is_from_mistral_org
=
str
(
tokenizer_name
).
split
(
"/"
)[
0
]
==
"mistralai"
tokenizer_name
,
if
is_from_mistral_org
and
tokenizer_mode
!=
"mistral"
:
*
args
,
warnings
.
warn
(
trust_remote_code
=
trust_remote_code
,
'It is strongly recommended to run mistral models with '
revision
=
revision
,
'`--tokenizer_mode "mistral"` to ensure correct '
**
kwargs
)
'encoding and decoding.'
,
except
ValueError
as
e
:
FutureWarning
,
# If the error pertains to the tokenizer class not existing or not
stacklevel
=
2
)
# currently being imported, suggest using the --trust-remote-code flag.
if
(
not
trust_remote_code
and
if
tokenizer_mode
==
"mistral"
:
(
"does not exist or is not currently imported."
in
str
(
e
)
tokenizer
=
MistralTokenizer
.
from_pretrained
(
str
(
tokenizer_name
),
or
"requires you to execute the tokenizer file"
in
str
(
e
))):
revision
=
revision
)
err_msg
=
(
else
:
"Failed to load the tokenizer. If the tokenizer is a custom "
try
:
"tokenizer not yet available in the HuggingFace transformers "
tokenizer
=
AutoTokenizer
.
from_pretrained
(
"library, consider setting `trust_remote_code=True` in LLM "
"or using the `--trust-remote-code` flag in the CLI."
)
raise
RuntimeError
(
err_msg
)
from
e
else
:
raise
e
except
AttributeError
as
e
:
if
"BaichuanTokenizer"
in
str
(
e
):
# This is for the error "'BaichuanTokenizer' object has no
# attribute 'sp_model'".
tokenizer
=
BaichuanTokenizer
.
from_pretrained
(
tokenizer_name
,
tokenizer_name
,
*
args
,
*
args
,
trust_remote_code
=
trust_remote_code
,
trust_remote_code
=
trust_remote_code
,
revision
=
revision
,
revision
=
revision
,
**
kwargs
)
**
kwargs
,
else
:
)
raise
e
except
ValueError
as
e
:
# If the error pertains to the tokenizer class not existing or not
# currently being imported,
# suggest using the --trust-remote-code flag.
if
not
trust_remote_code
and
(
"does not exist or is not currently imported."
in
str
(
e
)
or
"requires you to execute the tokenizer file"
in
str
(
e
)):
err_msg
=
(
"Failed to load the tokenizer. If the tokenizer "
"is a custom tokenizer not yet available in the "
"HuggingFace transformers library, consider "
"setting `trust_remote_code=True` in LLM or using "
"the `--trust-remote-code` flag in the CLI."
)
raise
RuntimeError
(
err_msg
)
from
e
else
:
raise
e
except
AttributeError
as
e
:
if
"BaichuanTokenizer"
in
str
(
e
):
# This is for the error "'BaichuanTokenizer' object has no
# attribute 'sp_model'".
tokenizer
=
BaichuanTokenizer
.
from_pretrained
(
tokenizer_name
,
*
args
,
trust_remote_code
=
trust_remote_code
,
revision
=
revision
,
**
kwargs
,
)
else
:
raise
e
if
not
isinstance
(
tokenizer
,
PreTrainedTokenizerFast
):
logger
.
warning
(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead."
)
tokenizer
=
get_cached_tokenizer
(
tokenizer
)
if
not
isinstance
(
tokenizer
,
PreTrainedTokenizerFast
):
return
tokenizer
logger
.
warning
(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead."
)
return
get_cached_tokenizer
(
tokenizer
)
def
get_lora_tokenizer
(
lora_request
:
LoRARequest
,
*
args
,
def
get_lora_tokenizer
(
lora_request
:
LoRARequest
,
*
args
,
...
...
vllm/transformers_utils/tokenizers/__init__.py
View file @
0640f227
from
vllm.transformers_utils.tokenizers.baichuan
import
BaichuanTokenizer
from
vllm.transformers_utils.tokenizers.baichuan
import
BaichuanTokenizer
from
vllm.transformers_utils.tokenizers.mistral
import
MistralTokenizer
__all__
=
[
__all__
=
[
"BaichuanTokenizer"
,
"MistralTokenizer"
]
"BaichuanTokenizer"
,
]
vllm/transformers_utils/tokenizers/mistral.py
0 → 100644
View file @
0640f227
import
os
import
re
from
dataclasses
import
dataclass
from
pathlib
import
Path
from
typing
import
TYPE_CHECKING
,
Any
,
Dict
,
List
,
Optional
,
Union
from
huggingface_hub
import
HfApi
,
hf_hub_download
# yapf: disable
from
mistral_common.tokens.tokenizers.mistral
import
ChatCompletionRequest
from
mistral_common.tokens.tokenizers.mistral
import
(
MistralTokenizer
as
PublicMistralTokenizer
)
# yapf: enable
from
mistral_common.tokens.tokenizers.sentencepiece
import
(
SentencePieceTokenizer
)
from
mistral_common.tokens.tokenizers.tekken
import
(
SpecialTokenPolicy
,
Tekkenizer
)
if
TYPE_CHECKING
:
from
vllm.entrypoints.chat_utils
import
ConversationMessage
@dataclass
class Encoding:
    """Minimal tokenization result that carries only the token ids."""

    # Token ids of the encoded prompt.
    input_ids: List[int]
def find_tokenizer_file(files: List[str]):
    """Return the one Mistral tokenizer file name contained in ``files``.

    A name qualifies when it starts with ``tokenizer.model.v`` or is exactly
    ``tekken.json``.

    Args:
        files: Candidate file names (for example a directory listing).

    Returns:
        The single matching file name.

    Raises:
        OSError: If no name matches, or if more than one name matches.
    """
    file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$")

    matched_files = [file for file in files if file_pattern.match(file)]
    # Every piece of the message must be an f-string; otherwise the
    # placeholders are emitted literally instead of being interpolated.
    if len(matched_files) > 1:
        raise OSError(
            f"Found {len(matched_files)} files matching the "
            f"pattern: `{file_pattern.pattern}`. Make sure only one Mistral "
            f"tokenizer is present in {files}.")
    if not matched_files:
        raise OSError(
            f"Found {len(matched_files)} files matching the "
            f"pattern: `{file_pattern.pattern}`. Make sure that a Mistral "
            f"tokenizer is present in {files}.")

    return matched_files[0]
class MistralTokenizer:
    """Wrap a ``mistral_common`` tokenizer behind the tokenizer methods and
    attributes that vLLM calls (``encode``, ``decode``, ``__call__``,
    ``apply_chat_template``, ...)."""

    def __init__(self, tokenizer: "PublicMistralTokenizer") -> None:
        """Store the wrapped tokenizer and derive the attributes read later.

        Args:
            tokenizer: The ``mistral_common`` tokenizer to wrap. Its
                ``instruct_tokenizer.tokenizer`` must be a ``Tekkenizer`` or
                a ``SentencePieceTokenizer``.
        """
        self.mistral = tokenizer
        self.instruct = tokenizer.instruct_tokenizer
        self.tokenizer = tokenizer.instruct_tokenizer.tokenizer

        self.vocab_size = len(self.tokenizer.vocab())

        assert isinstance(self.tokenizer,
                          (Tekkenizer, SentencePieceTokenizer)), type(
                              self.tokenizer)

        if (is_tekken := isinstance(self.tokenizer, Tekkenizer)):
            # Make sure special tokens will not raise
            self.tokenizer.special_token_policy = SpecialTokenPolicy.IGNORE

        self._is_tekken = is_tekken

        # the following attributes are set to fit VLLM's design
        self.is_fast = True
        self.chat_template = True
        self.all_special_ids: List[Any] = []
        self.all_special_tokens: List[Any] = []
        self.all_special_tokens_extended: List[Any] = []

    @classmethod
    def from_pretrained(cls,
                        path_or_repo_id: str,
                        *,
                        revision: Optional[str] = None) -> "MistralTokenizer":
        """Build a tokenizer from a local file, a local directory or a Hub
        repo id.

        Args:
            path_or_repo_id: Path to a tokenizer file, path to a directory
                containing exactly one tokenizer file, or an
                ``"<org>/<name>"`` repo id.
            revision: Revision used when the tokenizer is fetched from the
                Hub.

        Returns:
            A new ``MistralTokenizer`` wrapping the loaded tokenizer.
        """
        tokenizer_file = cls._resolve_tokenizer_file(path_or_repo_id,
                                                     revision)
        mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file)
        return cls(mistral_tokenizer)

    @classmethod
    def _resolve_tokenizer_file(cls, path_or_repo_id: str,
                                revision: Optional[str]) -> str:
        """Turn ``path_or_repo_id`` into the path of a tokenizer file."""
        path = Path(path_or_repo_id)

        if not path.exists():
            # Not on disk: treat the argument as a Hub repo id.
            assert len(path_or_repo_id.split("/")) == 2, (
                "You have either provided a non-existent path: "
                f"{path_or_repo_id} or an invalid HF Hub repo id.")
            return cls._download_mistral_tokenizer_from_hf(
                path_or_repo_id, revision)

        if path.is_dir():
            tokenizer_file_name = find_tokenizer_file(
                os.listdir(path_or_repo_id))
            return str(path / tokenizer_file_name)

        # A direct file path is used as-is. Without this return the file
        # case left the result unassigned and failed with UnboundLocalError.
        assert path.is_file(), f"Invalid path: {path_or_repo_id}"
        return str(path)

    @staticmethod
    def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
                                            revision: Optional[str]) -> str:
        """Download the tokenizer file of a Hub repo and return its local
        path.

        The repo file listing is requested for the same ``revision`` that is
        downloaded, so the chosen file name exists at that revision.
        """
        api = HfApi()
        repo_info = api.model_info(tokenizer_name, revision=revision)
        files = [s.rfilename for s in repo_info.siblings]

        filename = find_tokenizer_file(files)

        tokenizer_file = hf_hub_download(tokenizer_name,
                                         filename=filename,
                                         revision=revision)
        return tokenizer_file

    def __call__(
        self,
        prompt: str,
        add_special_tokens: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
    ):
        """Encode ``prompt`` and wrap the ids in an ``Encoding``.

        ``add_special_tokens`` is accepted but not used. When ``truncation``
        is true the ids are cut to the first ``max_length`` entries
        (``max_length=None`` keeps everything).
        """
        # Mistral Tokenizers should not add special tokens
        input_ids = self.encode(prompt)

        if truncation:
            input_ids = input_ids[:max_length]

        return Encoding(input_ids=input_ids)

    def get_added_vocab(self) -> List[str]:
        """Return an empty list."""
        # Mistral tokenizers have no added vocabulary
        return []

    def encode(self, prompt: str) -> List[int]:
        """Encode ``prompt`` with a BOS token and without an EOS token."""
        # `encode` should only be used for prompt completion
        # it should never be used for chat_completion.
        # For chat completion use `apply_chat_template`
        return self.tokenizer.encode(prompt, bos=True, eos=False)

    def apply_chat_template(self,
                            conversation: List["ConversationMessage"],
                            tools: Optional[Dict[str, Any]] = None,
                            **kwargs) -> List[int]:
        """Encode a chat conversation and return its token ids.

        ``tools`` must be ``None``; extra keyword arguments are ignored.
        """
        assert tools is None, "`tools` are not yet supported."

        request = ChatCompletionRequest(
            messages=conversation)  # type: ignore[type-var]
        encoded = self.mistral.encode_chat_completion(request)

        # The token ids are returned directly; no text prompt is built.
        return encoded.tokens

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join token pieces into text.

        Tekken pieces are concatenated; otherwise the wrapped tokenizer's
        ``decode`` is called on ``tokens``.
        """
        if self._is_tekken:
            return "".join(tokens)
        else:
            return self.tokenizer.decode(tokens)  # type: ignore[arg-type]

    def decode(self, ids: Union[List[int], int]) -> str:
        """Decode one id or a list of ids into text."""
        if isinstance(ids, int):
            ids = [ids]
        return self.tokenizer.decode(ids)

    @property
    def eos_token_id(self):
        """The wrapped tokenizer's ``eos_id``."""
        return self.tokenizer.eos_id

    def convert_ids_to_tokens(
            self,
            ids: List[int],
            skip_special_tokens: Optional[bool] = True) -> List[str]:
        """Map each id to its piece via the wrapped tokenizer's
        ``id_to_piece``.

        ``skip_special_tokens`` must be truthy; any other value fails the
        assertion below.
        """
        # TODO(Patrick) - potentially allow special tokens to not be skipped
        # The assertion fires when the caller asks NOT to skip special
        # tokens, so the message names that unsupported request.
        assert (
            skip_special_tokens
        ), "Not skipping special tokens is not supported for Mistral tokenizers."

        assert isinstance(self.tokenizer,
                          (Tekkenizer, SentencePieceTokenizer)), type(
                              self.tokenizer)

        tokens = [self.tokenizer.id_to_piece(token_id) for token_id in ids]
        return tokens

    def __len__(self):
        """Return the vocabulary size computed at construction."""
        return self.vocab_size
vllm/transformers_utils/utils.py
0 → 100644
View file @
0640f227
from
os
import
PathLike
from
pathlib
import
Path
from
typing
import
Union
def check_gguf_file(model: Union[str, PathLike]) -> bool:
    """Tell whether ``model`` points at a GGUF file on disk.

    Anything that is not an existing regular file is rejected. A ``.gguf``
    suffix is accepted without reading the file; for any other name the
    first four bytes must equal ``b"GGUF"``.
    """
    candidate = Path(model)

    # Guard clauses: the cheap filesystem and name checks come first.
    if not candidate.is_file():
        return False
    if candidate.suffix == ".gguf":
        return True

    # Fall back to the leading bytes of the file.
    with open(candidate, "rb") as stream:
        magic = stream.read(4)
    return magic == b"GGUF"
vllm/utils.py
View file @
0640f227
...
@@ -25,6 +25,8 @@ import numpy.typing as npt
...
@@ -25,6 +25,8 @@ import numpy.typing as npt
import
psutil
import
psutil
import
torch
import
torch
import
torch.types
import
torch.types
import
yaml
from
packaging.version
import
Version
from
typing_extensions
import
ParamSpec
,
TypeIs
,
assert_never
from
typing_extensions
import
ParamSpec
,
TypeIs
,
assert_never
import
vllm.envs
as
envs
import
vllm.envs
as
envs
...
@@ -1092,6 +1094,9 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
...
@@ -1092,6 +1094,9 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
if
args
is
None
:
if
args
is
None
:
args
=
sys
.
argv
[
1
:]
args
=
sys
.
argv
[
1
:]
if
'--config'
in
args
:
args
=
FlexibleArgumentParser
.
_pull_args_from_config
(
args
)
# Convert underscores to dashes and vice versa in argument names
# Convert underscores to dashes and vice versa in argument names
processed_args
=
[]
processed_args
=
[]
for
arg
in
args
:
for
arg
in
args
:
...
@@ -1108,9 +1113,114 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
...
@@ -1108,9 +1113,114 @@ class FlexibleArgumentParser(argparse.ArgumentParser):
return
super
().
parse_args
(
processed_args
,
namespace
)
return
super
().
parse_args
(
processed_args
,
namespace
)
@staticmethod
def _pull_args_from_config(args: List[str]) -> List[str]:
    """Replace ``--config <file>`` in ``args`` with the arguments stored in
    that file.

    The arguments loaded from the config file are inserted right after the
    first element of ``args`` (the sub command) and before the remaining
    command-line arguments.

    Example, with ``config.yaml`` containing::

        port: 12323
        tensor-parallel-size: 4

    the argument list::

        ['serve', 'facebook/opt-12B', '--config', 'config.yaml', '-tp', '2']

    becomes::

        ['serve', '--port', '12323', '--tensor-parallel-size', '4',
         'facebook/opt-12B', '-tp', '2']

    Please note how the config args are inserted after the sub command.
    This way the order of priorities is maintained when these args are
    parsed by super().

    Args:
        args: Command-line arguments containing ``--config`` exactly once.

    Returns:
        A new list without ``--config`` and its value, with the config
        file's arguments spliced in.

    Raises:
        ValueError: If ``--config`` is the last argument (no file given).
    """
    assert args.count(
        '--config') <= 1, "More than one config file specified!"

    index = args.index('--config')
    if index == len(args) - 1:
        # Adjacent literals instead of a backslash continuation, which
        # would embed the source indentation in the message.
        raise ValueError("No config file specified! "
                         "Please check your command-line arguments.")

    file_path = args[index + 1]

    config_args = FlexibleArgumentParser._load_config_file(file_path)

    if index == 0:
        # No sub command precedes '--config'. Keeping args[0] here would
        # retain the '--config' flag itself and drop its file path.
        return config_args + args[2:]

    # 0th index is for {serve,chat,complete}
    # followed by config args
    # followed by rest of cli args.
    # maintaining this order will enforce the precedence
    # of cli > config > defaults
    return [args[0]] + config_args + args[1:index] + args[index + 2:]
@staticmethod
def _load_config_file(file_path: str) -> List[str]:
    """Load a yaml file and return its key/value pairs as a flattened,
    argparse-like argument list.

    Example, for a file containing::

        port: 12323
        tensor-parallel-size: 4

    the result is::

        ['--port', '12323', '--tensor-parallel-size', '4']

    Args:
        file_path: Path of the config file; its extension must be ``yaml``
            or ``yml``.

    Returns:
        ``'--' + key`` followed by ``str(value)`` for every entry of the
        file, in file order. An empty file yields an empty list.

    Raises:
        ValueError: If the extension is neither ``yaml`` nor ``yml``.
        Exception: Any error raised while opening or parsing the file is
            logged and re-raised.
    """
    extension: str = file_path.split('.')[-1]
    if extension not in ('yaml', 'yml'):
        # Exceptions do not %-format their arguments, so the extension is
        # interpolated explicitly.
        raise ValueError("Config file must be of a yaml/yml type. "
                         f"{extension} supplied")

    # only expecting a flat dictionary of atomic types
    processed_args: List[str] = []

    config: Dict[str, Union[int, str]] = {}
    try:
        with open(file_path, 'r') as config_file:
            config = yaml.safe_load(config_file)
    except Exception:
        logger.error(
            "Unable to read the config file at %s. "
            "Make sure path is correct", file_path)
        raise

    # An empty YAML document loads as None; treat it as "no arguments".
    if config is None:
        config = {}

    for key, value in config.items():
        processed_args.append('--' + key)
        processed_args.append(str(value))

    return processed_args
async
def
_run_task_with_lock
(
task
:
Callable
,
lock
:
asyncio
.
Lock
,
*
args
,
async
def
_run_task_with_lock
(
task
:
Callable
,
lock
:
asyncio
.
Lock
,
*
args
,
**
kwargs
):
**
kwargs
):
"""Utility function to run async task in a lock"""
"""Utility function to run async task in a lock"""
async
with
lock
:
async
with
lock
:
return
await
task
(
*
args
,
**
kwargs
)
return
await
task
(
*
args
,
**
kwargs
)
# Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0.
# In particular, the FakeScalarType is not supported for earlier versions of
# PyTorch which breaks dynamo for any ops registered using ScalarType.
def supports_dynamo() -> bool:
    """Report whether the installed PyTorch is at least version 2.4.0.

    Only the ``base_version`` of ``torch.__version__`` takes part in the
    comparison.
    """
    minimum_version = Version("2.4.0")
    installed_base = Version(torch.__version__).base_version
    return Version(installed_base) >= minimum_version
vllm/version.py
View file @
0640f227
...
@@ -9,4 +9,4 @@ except Exception as e:
...
@@ -9,4 +9,4 @@ except Exception as e:
stacklevel
=
2
)
stacklevel
=
2
)
__commit__
=
"COMMIT_HASH_PLACEHOLDER"
__commit__
=
"COMMIT_HASH_PLACEHOLDER"
__version__
=
"0.
5.5
"
__version__
=
"0.
6.0
"
vllm/worker/cpu_model_runner.py
View file @
0640f227
...
@@ -10,11 +10,11 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
...
@@ -10,11 +10,11 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
SchedulerConfig
)
SchedulerConfig
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.model_loader
import
get_model
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
BatchedTensorInputs
,
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
BatchedTensorInputs
,
MultiModalInputs
)
MultiModalInputs
)
from
vllm.sequence
import
(
IntermediateTensors
,
SamplerOutput
,
from
vllm.sequence
import
IntermediateTensors
,
SequenceGroupMetadata
SequenceGroupMetadata
)
from
vllm.utils
import
make_tensor_with_pad
from
vllm.utils
import
make_tensor_with_pad
from
vllm.worker.model_runner_base
import
(
from
vllm.worker.model_runner_base
import
(
ModelRunnerBase
,
ModelRunnerInputBase
,
ModelRunnerBase
,
ModelRunnerInputBase
,
...
...
vllm/worker/enc_dec_model_runner.py
View file @
0640f227
...
@@ -16,9 +16,10 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
...
@@ -16,9 +16,10 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
from
vllm.inputs
import
INPUT_REGISTRY
,
InputRegistry
from
vllm.inputs
import
INPUT_REGISTRY
,
InputRegistry
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalRegistry
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalRegistry
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
(
IntermediateTensors
,
PoolerOutput
,
SamplerOutput
,
from
vllm.sequence
import
(
IntermediateTensors
,
PoolerOutput
,
SequenceGroupMetadata
)
SequenceGroupMetadata
)
from
vllm.utils
import
STR_NOT_IMPL_ENC_DEC_BACKEND
,
make_tensor_with_pad
from
vllm.utils
import
STR_NOT_IMPL_ENC_DEC_BACKEND
,
make_tensor_with_pad
from
vllm.worker.model_runner
import
(
GPUModelRunnerBase
,
from
vllm.worker.model_runner
import
(
GPUModelRunnerBase
,
...
...
Prev
1
…
12
13
14
15
16
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment