Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
823ab796
Unverified
Commit
823ab796
authored
Jan 28, 2025
by
Harry Mellor
Committed by
GitHub
Jan 27, 2025
Browse files
Update `pre-commit` hooks (#12475)
Signed-off-by:
Harry Mellor
<
19981378+hmellor@users.noreply.github.com
>
parent
6116ca8c
Changes
64
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
69 additions
and
61 deletions
+69
-61
vllm/model_executor/layers/vocab_parallel_embedding.py
vllm/model_executor/layers/vocab_parallel_embedding.py
+8
-8
vllm/model_executor/model_loader/loader.py
vllm/model_executor/model_loader/loader.py
+3
-2
vllm/model_executor/model_loader/tensorizer.py
vllm/model_executor/model_loader/tensorizer.py
+2
-2
vllm/model_executor/models/gemma.py
vllm/model_executor/models/gemma.py
+2
-2
vllm/model_executor/models/granitemoe.py
vllm/model_executor/models/granitemoe.py
+3
-3
vllm/model_executor/models/mllama.py
vllm/model_executor/models/mllama.py
+2
-2
vllm/model_executor/models/mlp_speculator.py
vllm/model_executor/models/mlp_speculator.py
+2
-2
vllm/model_executor/models/phimoe.py
vllm/model_executor/models/phimoe.py
+4
-4
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+2
-1
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+4
-4
vllm/model_executor/models/utils.py
vllm/model_executor/models/utils.py
+2
-3
vllm/model_executor/sampling_metadata.py
vllm/model_executor/sampling_metadata.py
+8
-3
vllm/platforms/neuron.py
vllm/platforms/neuron.py
+2
-2
vllm/scalar_type.py
vllm/scalar_type.py
+2
-2
vllm/spec_decode/spec_decode_worker.py
vllm/spec_decode/spec_decode_worker.py
+2
-2
vllm/spec_decode/top1_proposer.py
vllm/spec_decode/top1_proposer.py
+5
-5
vllm/spec_decode/util.py
vllm/spec_decode/util.py
+7
-5
vllm/transformers_utils/configs/nemotron.py
vllm/transformers_utils/configs/nemotron.py
+2
-2
vllm/utils.py
vllm/utils.py
+5
-5
vllm/v1/core/scheduler.py
vllm/v1/core/scheduler.py
+2
-2
No files found.
vllm/model_executor/layers/vocab_parallel_embedding.py
View file @
823ab796
...
@@ -115,17 +115,17 @@ class VocabParallelEmbeddingShardIndices:
...
@@ -115,17 +115,17 @@ class VocabParallelEmbeddingShardIndices:
def
__post_init__
(
self
):
def
__post_init__
(
self
):
# sanity checks
# sanity checks
assert
(
self
.
padded_org_vocab_start_index
<=
assert
(
self
.
padded_org_vocab_start_index
self
.
padded_org_vocab_end_index
)
<=
self
.
padded_org_vocab_end_index
)
assert
(
self
.
padded_added_vocab_start_index
<=
assert
(
self
.
padded_added_vocab_start_index
self
.
padded_added_vocab_end_index
)
<=
self
.
padded_added_vocab_end_index
)
assert
self
.
org_vocab_start_index
<=
self
.
org_vocab_end_index
assert
self
.
org_vocab_start_index
<=
self
.
org_vocab_end_index
assert
self
.
added_vocab_start_index
<=
self
.
added_vocab_end_index
assert
self
.
added_vocab_start_index
<=
self
.
added_vocab_end_index
assert
self
.
org_vocab_start_index
<=
self
.
padded_org_vocab_start_index
assert
self
.
org_vocab_start_index
<=
self
.
padded_org_vocab_start_index
assert
(
self
.
added_vocab_start_index
<=
assert
(
self
.
added_vocab_start_index
self
.
padded_added_vocab_start_index
)
<=
self
.
padded_added_vocab_start_index
)
assert
self
.
org_vocab_end_index
<=
self
.
padded_org_vocab_end_index
assert
self
.
org_vocab_end_index
<=
self
.
padded_org_vocab_end_index
assert
self
.
added_vocab_end_index
<=
self
.
padded_added_vocab_end_index
assert
self
.
added_vocab_end_index
<=
self
.
padded_added_vocab_end_index
...
@@ -141,8 +141,8 @@ def get_masked_input_and_mask(
...
@@ -141,8 +141,8 @@ def get_masked_input_and_mask(
added_vocab_end_index
:
int
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
added_vocab_end_index
:
int
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
# torch.compile will fuse all of the pointwise ops below
# torch.compile will fuse all of the pointwise ops below
# into a single kernel, making it very fast
# into a single kernel, making it very fast
org_vocab_mask
=
(
input_
>=
org_vocab_start_index
)
&
(
input_
<
org_vocab_mask
=
(
input_
>=
org_vocab_start_index
)
&
(
org_vocab_end_index
)
input_
<
org_vocab_end_index
)
added_vocab_mask
=
(
input_
>=
added_vocab_start_index
)
&
(
added_vocab_mask
=
(
input_
>=
added_vocab_start_index
)
&
(
input_
<
added_vocab_end_index
)
input_
<
added_vocab_end_index
)
added_offset
=
added_vocab_start_index
-
(
added_offset
=
added_vocab_start_index
-
(
...
...
vllm/model_executor/model_loader/loader.py
View file @
823ab796
...
@@ -1121,8 +1121,9 @@ class BitsAndBytesModelLoader(BaseModelLoader):
...
@@ -1121,8 +1121,9 @@ class BitsAndBytesModelLoader(BaseModelLoader):
# from being incorrectly identified as being present in
# from being incorrectly identified as being present in
# 'vpm.encoder.layers.0.self_attn.qkv_proj.weight
# 'vpm.encoder.layers.0.self_attn.qkv_proj.weight
shard_pos
=
quant_param_name
.
find
(
shard_name
)
shard_pos
=
quant_param_name
.
find
(
shard_name
)
can_correct_rename
=
(
shard_pos
>
0
)
and
(
can_correct_rename
=
(
shard_pos
quant_param_name
[
shard_pos
-
1
]
==
"."
)
>
0
)
and
(
quant_param_name
[
shard_pos
-
1
]
==
"."
)
# If the quant_param_name is packed, it won't occur in the
# If the quant_param_name is packed, it won't occur in the
# param_dict before renaming.
# param_dict before renaming.
new_quant_param_name
=
quant_param_name
.
replace
(
new_quant_param_name
=
quant_param_name
.
replace
(
...
...
vllm/model_executor/model_loader/tensorizer.py
View file @
823ab796
...
@@ -298,8 +298,8 @@ class TensorizerAgent:
...
@@ -298,8 +298,8 @@ class TensorizerAgent:
to allow for adapter added tokens."""
to allow for adapter added tokens."""
for
child
in
self
.
model
.
modules
():
for
child
in
self
.
model
.
modules
():
if
(
isinstance
(
child
,
VocabParallelEmbedding
)
if
(
isinstance
(
child
,
VocabParallelEmbedding
)
and
child
.
weight
.
shape
[
0
]
<
and
child
.
weight
.
shape
[
0
]
child
.
num_embeddings_per_partition
):
<
child
.
num_embeddings_per_partition
):
new_weight
=
torch
.
empty
(
child
.
num_embeddings_per_partition
,
new_weight
=
torch
.
empty
(
child
.
num_embeddings_per_partition
,
child
.
embedding_dim
,
child
.
embedding_dim
,
dtype
=
child
.
weight
.
dtype
,
dtype
=
child
.
weight
.
dtype
,
...
...
vllm/model_executor/models/gemma.py
View file @
823ab796
...
@@ -13,7 +13,7 @@
...
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
"""Inference-only Gemma model compatible with HuggingFace weights."""
"""Inference-only Gemma model compatible with HuggingFace weights."""
from
functools
import
lru_
cache
from
functools
import
cache
from
typing
import
Iterable
,
List
,
Optional
,
Set
,
Tuple
,
Union
from
typing
import
Iterable
,
List
,
Optional
,
Set
,
Tuple
,
Union
import
torch
import
torch
...
@@ -48,7 +48,7 @@ from .utils import (is_pp_missing_parameter,
...
@@ -48,7 +48,7 @@ from .utils import (is_pp_missing_parameter,
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
@
lru_
cache
(
maxsize
=
None
)
@
cache
def
_get_gemma_act_fn
(
def
_get_gemma_act_fn
(
hidden_act
:
Optional
[
str
],
hidden_act
:
Optional
[
str
],
hidden_activation
:
Optional
[
str
],
hidden_activation
:
Optional
[
str
],
...
...
vllm/model_executor/models/granitemoe.py
View file @
823ab796
...
@@ -429,10 +429,10 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -429,10 +429,10 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
for
e
in
range
(
p
.
size
(
0
)):
for
e
in
range
(
p
.
size
(
0
)):
w1_name
=
n
.
replace
(
w1_name
=
n
.
replace
(
'.block_sparse_moe.input_linear.weight'
,
'.block_sparse_moe.input_linear.weight'
,
".block_sparse_moe.experts.
%d
.w1.weight"
%
e
)
f
".block_sparse_moe.experts.
{
e
}
.w1.weight"
)
w3_name
=
n
.
replace
(
w3_name
=
n
.
replace
(
'.block_sparse_moe.input_linear.weight'
,
'.block_sparse_moe.input_linear.weight'
,
".block_sparse_moe.experts.
%d
.w3.weight"
%
e
)
f
".block_sparse_moe.experts.
{
e
}
.w3.weight"
)
w1_param
,
w3_param
=
p
[
e
].
chunk
(
2
,
dim
=
0
)
w1_param
,
w3_param
=
p
[
e
].
chunk
(
2
,
dim
=
0
)
assert
w1_name
not
in
new_weights
assert
w1_name
not
in
new_weights
assert
w3_name
not
in
new_weights
assert
w3_name
not
in
new_weights
...
@@ -442,7 +442,7 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -442,7 +442,7 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
for
e
in
range
(
p
.
size
(
0
)):
for
e
in
range
(
p
.
size
(
0
)):
w2_name
=
n
.
replace
(
w2_name
=
n
.
replace
(
'.block_sparse_moe.output_linear.weight'
,
'.block_sparse_moe.output_linear.weight'
,
".block_sparse_moe.experts.
%d
.w2.weight"
%
e
)
f
".block_sparse_moe.experts.
{
e
}
.w2.weight"
)
w2_param
=
p
[
e
]
w2_param
=
p
[
e
]
assert
w2_name
not
in
new_weights
assert
w2_name
not
in
new_weights
new_weights
[
w2_name
]
=
w2_param
new_weights
[
w2_name
]
=
w2_param
...
...
vllm/model_executor/models/mllama.py
View file @
823ab796
...
@@ -1365,8 +1365,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
...
@@ -1365,8 +1365,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
# For 1) text-only prefill and decode, 2) image-present decode.
# For 1) text-only prefill and decode, 2) image-present decode.
if
image_inputs
is
None
:
if
image_inputs
is
None
:
full_text_row_masked_out_mask
=
(
full_text_row_masked_out_mask
=
(
attn_metadata
.
encoder_seq_lens_tensor
!=
0
).
reshape
(
-
1
,
1
).
to
(
attn_metadata
.
encoder_seq_lens_tensor
input_ids
.
device
)
!=
0
).
reshape
(
-
1
,
1
).
to
(
input_ids
.
device
)
skip_cross_attention
=
max
(
attn_metadata
.
encoder_seq_lens
)
==
0
skip_cross_attention
=
max
(
attn_metadata
.
encoder_seq_lens
)
==
0
# For image-present prefill.
# For image-present prefill.
...
...
vllm/model_executor/models/mlp_speculator.py
View file @
823ab796
...
@@ -81,8 +81,8 @@ class MLPSpeculator(nn.Module):
...
@@ -81,8 +81,8 @@ class MLPSpeculator(nn.Module):
if
self
.
tie_weights
:
if
self
.
tie_weights
:
assert
(
assert
(
self
.
n_predict
>
self
.
n_predict
>
1
1
),
"You cannot tie weights between stages when only 1 exists"
),
"You cannot tie weights between stages when only 1 exists"
embedding
=
VocabParallelEmbedding
(
embedding
=
VocabParallelEmbedding
(
config
.
vocab_size
,
config
.
vocab_size
,
self
.
inner_dim
,
self
.
inner_dim
,
...
...
vllm/model_executor/models/phimoe.py
View file @
823ab796
...
@@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01):
...
@@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01):
# compute mask for sparsity
# compute mask for sparsity
mask_logits_threshold
,
max_ind
=
scores
.
max
(
dim
=-
1
,
keepdim
=
True
)
mask_logits_threshold
,
max_ind
=
scores
.
max
(
dim
=-
1
,
keepdim
=
True
)
factor
=
scores
.
abs
().
clamp
(
min
=
mask_logits_threshold
)
factor
=
scores
.
abs
().
clamp
(
min
=
mask_logits_threshold
)
mask_logits_threshold
=
(
mask_logits_threshold
=
(
(
mask_logits_threshold
-
scores
)
/
(
mask_logits_threshold
-
scores
)
/
factor
)
>
(
2
*
jitter_eps
)
factor
)
>
(
2
*
jitter_eps
)
# apply mask
# apply mask
masked_gates
=
scores
.
masked_fill
(
mask_logits_threshold
,
float
(
"-inf"
))
masked_gates
=
scores
.
masked_fill
(
mask_logits_threshold
,
float
(
"-inf"
))
...
@@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01):
...
@@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01):
mask_logits_threshold
,
max_ind
=
masked_scores
.
max
(
dim
=-
1
,
mask_logits_threshold
,
max_ind
=
masked_scores
.
max
(
dim
=-
1
,
keepdim
=
True
)
keepdim
=
True
)
factor
=
scores
.
abs
().
clamp
(
min
=
mask_logits_threshold
)
factor
=
scores
.
abs
().
clamp
(
min
=
mask_logits_threshold
)
mask_logits_threshold
=
(
mask_logits_threshold
=
(
(
mask_logits_threshold
-
scores
)
/
(
mask_logits_threshold
-
scores
)
/
factor
)
>
(
2
*
jitter_eps
)
factor
)
>
(
2
*
jitter_eps
)
# apply mask
# apply mask
masked_gates_top2
=
masked_scores
.
masked_fill
(
mask_logits_threshold
,
masked_gates_top2
=
masked_scores
.
masked_fill
(
mask_logits_threshold
,
...
...
vllm/model_executor/models/registry.py
View file @
823ab796
...
@@ -462,7 +462,8 @@ class _ModelRegistry:
...
@@ -462,7 +462,8 @@ class _ModelRegistry:
ModelRegistry
=
_ModelRegistry
({
ModelRegistry
=
_ModelRegistry
({
model_arch
:
_LazyRegisteredModel
(
model_arch
:
_LazyRegisteredModel
(
module_name
=
f
"vllm.model_executor.models.
{
mod_relname
}
"
,
module_name
=
f
"vllm.model_executor.models.
{
mod_relname
}
"
,
class_name
=
cls_name
,
class_name
=
cls_name
,
)
)
...
...
vllm/model_executor/models/ultravox.py
View file @
823ab796
...
@@ -333,10 +333,10 @@ class ModifiedWhisperEncoder(WhisperEncoder):
...
@@ -333,10 +333,10 @@ class ModifiedWhisperEncoder(WhisperEncoder):
return
hidden_states
return
hidden_states
@
MULTIMODAL_REGISTRY
.
register_processor
(
UltravoxMultiModalProcessor
,
@
MULTIMODAL_REGISTRY
.
register_processor
(
info
=
UltravoxProcessingInfo
,
UltravoxMultiModalProcessor
,
dummy_inputs
=
UltravoxDummyInputsBuilder
info
=
UltravoxProcessingInfo
,
)
dummy_inputs
=
UltravoxDummyInputsBuilder
)
class
UltravoxModel
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
class
UltravoxModel
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
hf_to_vllm_mapper
=
WeightsMapper
(
hf_to_vllm_mapper
=
WeightsMapper
(
...
...
vllm/model_executor/models/utils.py
View file @
823ab796
...
@@ -599,9 +599,8 @@ def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int):
...
@@ -599,9 +599,8 @@ def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int):
device
:
torch
.
device
,
device
:
torch
.
device
,
)
->
IntermediateTensors
:
)
->
IntermediateTensors
:
return
IntermediateTensors
({
return
IntermediateTensors
({
key
:
torch
.
zeros
((
batch_size
,
hidden_size
),
key
:
dtype
=
dtype
,
torch
.
zeros
((
batch_size
,
hidden_size
),
dtype
=
dtype
,
device
=
device
)
device
=
device
)
for
key
in
keys
for
key
in
keys
})
})
...
...
vllm/model_executor/sampling_metadata.py
View file @
823ab796
...
@@ -166,7 +166,8 @@ class SamplingMetadata:
...
@@ -166,7 +166,8 @@ class SamplingMetadata:
pin_memory
=
pin_memory
,
pin_memory
=
pin_memory
,
)
)
categorized_sample_indices
=
{
categorized_sample_indices
=
{
t
:
async_tensor_h2d
(
t
:
async_tensor_h2d
(
seq_ids
,
seq_ids
,
dtype
=
torch
.
int
,
dtype
=
torch
.
int
,
target_device
=
device
,
target_device
=
device
,
...
@@ -198,8 +199,12 @@ def _prepare_seq_groups(
...
@@ -198,8 +199,12 @@ def _prepare_seq_groups(
device
:
str
,
device
:
str
,
generators
:
Optional
[
Dict
[
str
,
torch
.
Generator
]]
=
None
,
generators
:
Optional
[
Dict
[
str
,
torch
.
Generator
]]
=
None
,
cache
:
Optional
[
SamplingMetadataCache
]
=
None
,
cache
:
Optional
[
SamplingMetadataCache
]
=
None
,
)
->
Tuple
[
List
[
SequenceGroupToSample
],
List
[
int
],
Dict
[
SamplingType
,
)
->
Tuple
[
List
[
int
]],
int
,
]:
List
[
SequenceGroupToSample
],
List
[
int
],
Dict
[
SamplingType
,
List
[
int
]],
int
,
]:
"""Prepare sequence groups and indices for sampling.
"""Prepare sequence groups and indices for sampling.
Args:
Args:
...
...
vllm/platforms/neuron.py
View file @
823ab796
...
@@ -38,8 +38,8 @@ class NeuronPlatform(Platform):
...
@@ -38,8 +38,8 @@ class NeuronPlatform(Platform):
if
parallel_config
.
world_size
>
1
:
if
parallel_config
.
world_size
>
1
:
parallel_config
.
distributed_executor_backend
=
"uni"
parallel_config
.
distributed_executor_backend
=
"uni"
assert
(
vllm_config
.
lora_config
is
assert
(
vllm_config
.
lora_config
None
),
"LoRA is not supported for Neuron backend."
is
None
),
"LoRA is not supported for Neuron backend."
assert
(
not
vllm_config
.
speculative_config
assert
(
not
vllm_config
.
speculative_config
),
"Speculative decoding not yet supported for Neuron backend."
),
"Speculative decoding not yet supported for Neuron backend."
...
...
vllm/scalar_type.py
View file @
823ab796
...
@@ -121,8 +121,8 @@ class ScalarType:
...
@@ -121,8 +121,8 @@ class ScalarType:
min_raw
=
max_raw
|
sign_bit_double
min_raw
=
max_raw
|
sign_bit_double
return
struct
.
unpack
(
'!d'
,
struct
.
pack
(
'!Q'
,
min_raw
))[
0
]
return
struct
.
unpack
(
'!d'
,
struct
.
pack
(
'!Q'
,
min_raw
))[
0
]
else
:
else
:
assert
(
not
self
.
is_signed
()
or
assert
(
not
self
.
is_signed
()
or
self
.
size_bits
self
.
size_bits
<=
64
),
"Cannot represent min as a int64_t"
<=
64
),
"Cannot represent min as a int64_t"
if
self
.
is_signed
():
if
self
.
is_signed
():
return
-
(
1
<<
(
self
.
size_bits
-
1
))
return
-
(
1
<<
(
self
.
size_bits
-
1
))
...
...
vllm/spec_decode/spec_decode_worker.py
View file @
823ab796
...
@@ -510,8 +510,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
...
@@ -510,8 +510,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
self
,
execute_model_req
:
ExecuteModelRequest
)
->
bool
:
self
,
execute_model_req
:
ExecuteModelRequest
)
->
bool
:
# When the batch size is too large, disable speculative decoding
# When the batch size is too large, disable speculative decoding
# to stop trading off throughput for latency.
# to stop trading off throughput for latency.
return
(
execute_model_req
.
running_queue_size
>=
return
(
execute_model_req
.
running_queue_size
self
.
disable_by_batch_size
)
>=
self
.
disable_by_batch_size
)
def
_maybe_disable_speculative_tokens
(
def
_maybe_disable_speculative_tokens
(
self
,
disable_all_speculation
:
bool
,
self
,
disable_all_speculation
:
bool
,
...
...
vllm/spec_decode/top1_proposer.py
View file @
823ab796
...
@@ -104,11 +104,11 @@ class Top1Proposer(SpeculativeProposer):
...
@@ -104,11 +104,11 @@ class Top1Proposer(SpeculativeProposer):
sampler_transposed
=
transposed
,
sampler_transposed
=
transposed
,
)
)
proposals
=
SpeculativeProposals
(
proposals
=
SpeculativeProposals
(
proposal_token_ids
=
proposal_tokens
,
proposal_
token_id
s
=
proposal_
token
s
,
proposal_
prob
s
=
proposal_
prob
s
,
proposal_
prob
s
=
proposal_
prob
s
,
proposal_
len
s
=
proposal_
len
s
,
proposal_lens
=
proposal_lens
,
no_proposals
=
maybe_sampler_output
no_proposals
=
maybe_sampler_output
is
None
)
is
None
)
return
proposals
return
proposals
def
_split_by_proposal_len
(
def
_split_by_proposal_len
(
...
...
vllm/spec_decode/util.py
View file @
823ab796
...
@@ -40,13 +40,15 @@ def get_sampled_token_logprobs(
...
@@ -40,13 +40,15 @@ def get_sampled_token_logprobs(
"""
"""
num_steps
,
batch_size
,
vocab_size
=
logprob_tensor
.
shape
num_steps
,
batch_size
,
vocab_size
=
logprob_tensor
.
shape
selected_logprobs
=
logprob_tensor
[
torch
.
arange
(
num_steps
).
unsqueeze
(
1
),
selected_logprobs
=
logprob_tensor
[
torch
.
arange
(
batch_size
),
torch
.
arange
(
num_steps
).
unsqueeze
(
1
),
sampled_token_ids
,
]
torch
.
arange
(
batch_size
),
sampled_token_ids
,
]
expanded_selected_logprobs
=
selected_logprobs
.
unsqueeze
(
-
1
).
expand
(
expanded_selected_logprobs
=
selected_logprobs
.
unsqueeze
(
-
1
).
expand
(
-
1
,
-
1
,
vocab_size
)
-
1
,
-
1
,
vocab_size
)
sampled_token_ids_ranks
=
(
logprob_tensor
>
sampled_token_ids_ranks
=
(
logprob_tensor
expanded_selected_logprobs
).
sum
(
-
1
).
add_
(
1
)
>
expanded_selected_logprobs
).
sum
(
-
1
).
add_
(
1
)
return
sampled_token_ids_ranks
,
selected_logprobs
return
sampled_token_ids_ranks
,
selected_logprobs
...
...
vllm/transformers_utils/configs/nemotron.py
View file @
823ab796
...
@@ -182,8 +182,8 @@ class NemotronConfig(PretrainedConfig):
...
@@ -182,8 +182,8 @@ class NemotronConfig(PretrainedConfig):
if
self
.
rope_scaling
is
None
:
if
self
.
rope_scaling
is
None
:
return
return
if
not
isinstance
(
self
.
rope_scaling
,
if
not
isinstance
(
self
.
rope_scaling
,
dict
)
or
len
(
dict
)
or
len
(
self
.
rope_scaling
)
!=
2
:
self
.
rope_scaling
)
!=
2
:
raise
ValueError
(
raise
ValueError
(
"`rope_scaling` must be a dictionary with two fields, "
"`rope_scaling` must be a dictionary with two fields, "
f
"`type` and `factor`, got
{
self
.
rope_scaling
}
"
)
f
"`type` and `factor`, got
{
self
.
rope_scaling
}
"
)
...
...
vllm/utils.py
View file @
823ab796
...
@@ -29,7 +29,7 @@ from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
...
@@ -29,7 +29,7 @@ from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
from
collections
import
OrderedDict
,
UserDict
,
defaultdict
from
collections
import
OrderedDict
,
UserDict
,
defaultdict
from
collections.abc
import
Hashable
,
Iterable
,
Mapping
from
collections.abc
import
Hashable
,
Iterable
,
Mapping
from
dataclasses
import
dataclass
,
field
from
dataclasses
import
dataclass
,
field
from
functools
import
lru_cache
,
partial
,
wraps
from
functools
import
cache
,
lru_cache
,
partial
,
wraps
from
typing
import
(
TYPE_CHECKING
,
Any
,
AsyncGenerator
,
Awaitable
,
Callable
,
from
typing
import
(
TYPE_CHECKING
,
Any
,
AsyncGenerator
,
Awaitable
,
Callable
,
Dict
,
Generator
,
Generic
,
Iterator
,
List
,
Literal
,
Dict
,
Generator
,
Generic
,
Iterator
,
List
,
Literal
,
NamedTuple
,
Optional
,
Tuple
,
Type
,
TypeVar
,
Union
,
NamedTuple
,
Optional
,
Tuple
,
Type
,
TypeVar
,
Union
,
...
@@ -352,7 +352,7 @@ class PyObjectCache:
...
@@ -352,7 +352,7 @@ class PyObjectCache:
self
.
_index
=
0
self
.
_index
=
0
@
lru_
cache
(
maxsize
=
None
)
@
cache
def
get_max_shared_memory_bytes
(
gpu
:
int
=
0
)
->
int
:
def
get_max_shared_memory_bytes
(
gpu
:
int
=
0
)
->
int
:
"""Returns the maximum shared memory per thread block in bytes."""
"""Returns the maximum shared memory per thread block in bytes."""
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
...
@@ -697,7 +697,7 @@ def create_kv_caches_with_random(
...
@@ -697,7 +697,7 @@ def create_kv_caches_with_random(
return
key_caches
,
value_caches
return
key_caches
,
value_caches
@
lru_
cache
(
maxsize
=
None
)
@
cache
def
is_pin_memory_available
()
->
bool
:
def
is_pin_memory_available
()
->
bool
:
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
return
current_platform
.
is_pin_memory_available
()
return
current_platform
.
is_pin_memory_available
()
...
@@ -886,7 +886,7 @@ def init_cached_hf_modules() -> None:
...
@@ -886,7 +886,7 @@ def init_cached_hf_modules() -> None:
init_hf_modules
()
init_hf_modules
()
@
lru_
cache
(
maxsize
=
None
)
@
cache
def
find_library
(
lib_name
:
str
)
->
str
:
def
find_library
(
lib_name
:
str
)
->
str
:
"""
"""
Find the library file in the system.
Find the library file in the system.
...
@@ -1607,7 +1607,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
...
@@ -1607,7 +1607,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
return
module
return
module
@
lru_
cache
(
maxsize
=
None
)
@
cache
def
get_vllm_optional_dependencies
():
def
get_vllm_optional_dependencies
():
metadata
=
importlib
.
metadata
.
metadata
(
"vllm"
)
metadata
=
importlib
.
metadata
.
metadata
(
"vllm"
)
requirements
=
metadata
.
get_all
(
"Requires-Dist"
,
[])
requirements
=
metadata
.
get_all
(
"Requires-Dist"
,
[])
...
...
vllm/v1/core/scheduler.py
View file @
823ab796
...
@@ -247,8 +247,8 @@ class Scheduler:
...
@@ -247,8 +247,8 @@ class Scheduler:
token_budget
-=
num_new_tokens
token_budget
-=
num_new_tokens
request
.
status
=
RequestStatus
.
RUNNING
request
.
status
=
RequestStatus
.
RUNNING
request
.
num_computed_tokens
=
num_computed_tokens
request
.
num_computed_tokens
=
num_computed_tokens
has_partial_request
=
(
num_computed_tokens
+
num_new_tokens
<
has_partial_request
=
(
num_computed_tokens
+
num_new_tokens
request
.
num_tokens
)
<
request
.
num_tokens
)
# Encoder-related.
# Encoder-related.
if
encoder_inputs_to_schedule
:
if
encoder_inputs_to_schedule
:
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment