Unverified commit 823ab796, authored by Harry Mellor, committed by GitHub
Browse files

Update `pre-commit` hooks (#12475)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 6116ca8c
...@@ -115,17 +115,17 @@ class VocabParallelEmbeddingShardIndices: ...@@ -115,17 +115,17 @@ class VocabParallelEmbeddingShardIndices:
def __post_init__(self): def __post_init__(self):
# sanity checks # sanity checks
assert (self.padded_org_vocab_start_index <= assert (self.padded_org_vocab_start_index
self.padded_org_vocab_end_index) <= self.padded_org_vocab_end_index)
assert (self.padded_added_vocab_start_index <= assert (self.padded_added_vocab_start_index
self.padded_added_vocab_end_index) <= self.padded_added_vocab_end_index)
assert self.org_vocab_start_index <= self.org_vocab_end_index assert self.org_vocab_start_index <= self.org_vocab_end_index
assert self.added_vocab_start_index <= self.added_vocab_end_index assert self.added_vocab_start_index <= self.added_vocab_end_index
assert self.org_vocab_start_index <= self.padded_org_vocab_start_index assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
assert (self.added_vocab_start_index <= assert (self.added_vocab_start_index
self.padded_added_vocab_start_index) <= self.padded_added_vocab_start_index)
assert self.org_vocab_end_index <= self.padded_org_vocab_end_index assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
assert self.added_vocab_end_index <= self.padded_added_vocab_end_index assert self.added_vocab_end_index <= self.padded_added_vocab_end_index
...@@ -141,8 +141,8 @@ def get_masked_input_and_mask( ...@@ -141,8 +141,8 @@ def get_masked_input_and_mask(
added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
# torch.compile will fuse all of the pointwise ops below # torch.compile will fuse all of the pointwise ops below
# into a single kernel, making it very fast # into a single kernel, making it very fast
org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_mask = (input_ >= org_vocab_start_index) & (
org_vocab_end_index) input_ < org_vocab_end_index)
added_vocab_mask = (input_ >= added_vocab_start_index) & ( added_vocab_mask = (input_ >= added_vocab_start_index) & (
input_ < added_vocab_end_index) input_ < added_vocab_end_index)
added_offset = added_vocab_start_index - ( added_offset = added_vocab_start_index - (
......
...@@ -1121,8 +1121,9 @@ class BitsAndBytesModelLoader(BaseModelLoader): ...@@ -1121,8 +1121,9 @@ class BitsAndBytesModelLoader(BaseModelLoader):
# from being incorrectly identified as being present in # from being incorrectly identified as being present in
# 'vpm.encoder.layers.0.self_attn.qkv_proj.weight # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight
shard_pos = quant_param_name.find(shard_name) shard_pos = quant_param_name.find(shard_name)
can_correct_rename = (shard_pos > 0) and ( can_correct_rename = (shard_pos
quant_param_name[shard_pos - 1] == ".") > 0) and (quant_param_name[shard_pos - 1]
== ".")
# If the quant_param_name is packed, it won't occur in the # If the quant_param_name is packed, it won't occur in the
# param_dict before renaming. # param_dict before renaming.
new_quant_param_name = quant_param_name.replace( new_quant_param_name = quant_param_name.replace(
......
...@@ -298,8 +298,8 @@ class TensorizerAgent: ...@@ -298,8 +298,8 @@ class TensorizerAgent:
to allow for adapter added tokens.""" to allow for adapter added tokens."""
for child in self.model.modules(): for child in self.model.modules():
if (isinstance(child, VocabParallelEmbedding) if (isinstance(child, VocabParallelEmbedding)
and child.weight.shape[0] < and child.weight.shape[0]
child.num_embeddings_per_partition): < child.num_embeddings_per_partition):
new_weight = torch.empty(child.num_embeddings_per_partition, new_weight = torch.empty(child.num_embeddings_per_partition,
child.embedding_dim, child.embedding_dim,
dtype=child.weight.dtype, dtype=child.weight.dtype,
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Inference-only Gemma model compatible with HuggingFace weights.""" """Inference-only Gemma model compatible with HuggingFace weights."""
from functools import lru_cache from functools import cache
from typing import Iterable, List, Optional, Set, Tuple, Union from typing import Iterable, List, Optional, Set, Tuple, Union
import torch import torch
...@@ -48,7 +48,7 @@ from .utils import (is_pp_missing_parameter, ...@@ -48,7 +48,7 @@ from .utils import (is_pp_missing_parameter,
logger = init_logger(__name__) logger = init_logger(__name__)
@lru_cache(maxsize=None) @cache
def _get_gemma_act_fn( def _get_gemma_act_fn(
hidden_act: Optional[str], hidden_act: Optional[str],
hidden_activation: Optional[str], hidden_activation: Optional[str],
......
...@@ -429,10 +429,10 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -429,10 +429,10 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
for e in range(p.size(0)): for e in range(p.size(0)):
w1_name = n.replace( w1_name = n.replace(
'.block_sparse_moe.input_linear.weight', '.block_sparse_moe.input_linear.weight',
".block_sparse_moe.experts.%d.w1.weight" % e) f".block_sparse_moe.experts.{e}.w1.weight")
w3_name = n.replace( w3_name = n.replace(
'.block_sparse_moe.input_linear.weight', '.block_sparse_moe.input_linear.weight',
".block_sparse_moe.experts.%d.w3.weight" % e) f".block_sparse_moe.experts.{e}.w3.weight")
w1_param, w3_param = p[e].chunk(2, dim=0) w1_param, w3_param = p[e].chunk(2, dim=0)
assert w1_name not in new_weights assert w1_name not in new_weights
assert w3_name not in new_weights assert w3_name not in new_weights
...@@ -442,7 +442,7 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -442,7 +442,7 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
for e in range(p.size(0)): for e in range(p.size(0)):
w2_name = n.replace( w2_name = n.replace(
'.block_sparse_moe.output_linear.weight', '.block_sparse_moe.output_linear.weight',
".block_sparse_moe.experts.%d.w2.weight" % e) f".block_sparse_moe.experts.{e}.w2.weight")
w2_param = p[e] w2_param = p[e]
assert w2_name not in new_weights assert w2_name not in new_weights
new_weights[w2_name] = w2_param new_weights[w2_name] = w2_param
......
...@@ -1365,8 +1365,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): ...@@ -1365,8 +1365,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
# For 1) text-only prefill and decode, 2) image-present decode. # For 1) text-only prefill and decode, 2) image-present decode.
if image_inputs is None: if image_inputs is None:
full_text_row_masked_out_mask = ( full_text_row_masked_out_mask = (
attn_metadata.encoder_seq_lens_tensor != 0).reshape(-1, 1).to( attn_metadata.encoder_seq_lens_tensor
input_ids.device) != 0).reshape(-1, 1).to(input_ids.device)
skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0 skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0
# For image-present prefill. # For image-present prefill.
......
...@@ -81,8 +81,8 @@ class MLPSpeculator(nn.Module): ...@@ -81,8 +81,8 @@ class MLPSpeculator(nn.Module):
if self.tie_weights: if self.tie_weights:
assert ( assert (
self.n_predict > self.n_predict > 1
1), "You cannot tie weights between stages when only 1 exists" ), "You cannot tie weights between stages when only 1 exists"
embedding = VocabParallelEmbedding( embedding = VocabParallelEmbedding(
config.vocab_size, config.vocab_size,
self.inner_dim, self.inner_dim,
......
...@@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01): ...@@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01):
# compute mask for sparsity # compute mask for sparsity
mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True) mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True)
factor = scores.abs().clamp(min=mask_logits_threshold) factor = scores.abs().clamp(min=mask_logits_threshold)
mask_logits_threshold = ( mask_logits_threshold = ((mask_logits_threshold - scores) /
(mask_logits_threshold - scores) / factor) > (2 * jitter_eps) factor) > (2 * jitter_eps)
# apply mask # apply mask
masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf")) masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf"))
...@@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01): ...@@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01):
mask_logits_threshold, max_ind = masked_scores.max(dim=-1, mask_logits_threshold, max_ind = masked_scores.max(dim=-1,
keepdim=True) keepdim=True)
factor = scores.abs().clamp(min=mask_logits_threshold) factor = scores.abs().clamp(min=mask_logits_threshold)
mask_logits_threshold = ( mask_logits_threshold = ((mask_logits_threshold - scores) /
(mask_logits_threshold - scores) / factor) > (2 * jitter_eps) factor) > (2 * jitter_eps)
# apply mask # apply mask
masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold, masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold,
......
...@@ -462,7 +462,8 @@ class _ModelRegistry: ...@@ -462,7 +462,8 @@ class _ModelRegistry:
ModelRegistry = _ModelRegistry({ ModelRegistry = _ModelRegistry({
model_arch: _LazyRegisteredModel( model_arch:
_LazyRegisteredModel(
module_name=f"vllm.model_executor.models.{mod_relname}", module_name=f"vllm.model_executor.models.{mod_relname}",
class_name=cls_name, class_name=cls_name,
) )
......
...@@ -333,10 +333,10 @@ class ModifiedWhisperEncoder(WhisperEncoder): ...@@ -333,10 +333,10 @@ class ModifiedWhisperEncoder(WhisperEncoder):
return hidden_states return hidden_states
@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor, @MULTIMODAL_REGISTRY.register_processor(
info=UltravoxProcessingInfo, UltravoxMultiModalProcessor,
dummy_inputs=UltravoxDummyInputsBuilder info=UltravoxProcessingInfo,
) dummy_inputs=UltravoxDummyInputsBuilder)
class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
hf_to_vllm_mapper = WeightsMapper( hf_to_vllm_mapper = WeightsMapper(
......
...@@ -599,9 +599,8 @@ def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int): ...@@ -599,9 +599,8 @@ def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int):
device: torch.device, device: torch.device,
) -> IntermediateTensors: ) -> IntermediateTensors:
return IntermediateTensors({ return IntermediateTensors({
key: torch.zeros((batch_size, hidden_size), key:
dtype=dtype, torch.zeros((batch_size, hidden_size), dtype=dtype, device=device)
device=device)
for key in keys for key in keys
}) })
......
...@@ -166,7 +166,8 @@ class SamplingMetadata: ...@@ -166,7 +166,8 @@ class SamplingMetadata:
pin_memory=pin_memory, pin_memory=pin_memory,
) )
categorized_sample_indices = { categorized_sample_indices = {
t: async_tensor_h2d( t:
async_tensor_h2d(
seq_ids, seq_ids,
dtype=torch.int, dtype=torch.int,
target_device=device, target_device=device,
...@@ -198,8 +199,12 @@ def _prepare_seq_groups( ...@@ -198,8 +199,12 @@ def _prepare_seq_groups(
device: str, device: str,
generators: Optional[Dict[str, torch.Generator]] = None, generators: Optional[Dict[str, torch.Generator]] = None,
cache: Optional[SamplingMetadataCache] = None, cache: Optional[SamplingMetadataCache] = None,
) -> Tuple[List[SequenceGroupToSample], List[int], Dict[SamplingType, ) -> Tuple[
List[int]], int, ]: List[SequenceGroupToSample],
List[int],
Dict[SamplingType, List[int]],
int,
]:
"""Prepare sequence groups and indices for sampling. """Prepare sequence groups and indices for sampling.
Args: Args:
......
...@@ -38,8 +38,8 @@ class NeuronPlatform(Platform): ...@@ -38,8 +38,8 @@ class NeuronPlatform(Platform):
if parallel_config.world_size > 1: if parallel_config.world_size > 1:
parallel_config.distributed_executor_backend = "uni" parallel_config.distributed_executor_backend = "uni"
assert (vllm_config.lora_config is assert (vllm_config.lora_config
None), "LoRA is not supported for Neuron backend." is None), "LoRA is not supported for Neuron backend."
assert (not vllm_config.speculative_config assert (not vllm_config.speculative_config
), "Speculative decoding not yet supported for Neuron backend." ), "Speculative decoding not yet supported for Neuron backend."
......
...@@ -121,8 +121,8 @@ class ScalarType: ...@@ -121,8 +121,8 @@ class ScalarType:
min_raw = max_raw | sign_bit_double min_raw = max_raw | sign_bit_double
return struct.unpack('!d', struct.pack('!Q', min_raw))[0] return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
else: else:
assert (not self.is_signed() or assert (not self.is_signed() or self.size_bits
self.size_bits <= 64), "Cannot represent min as a int64_t" <= 64), "Cannot represent min as a int64_t"
if self.is_signed(): if self.is_signed():
return -(1 << (self.size_bits - 1)) return -(1 << (self.size_bits - 1))
......
...@@ -510,8 +510,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): ...@@ -510,8 +510,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
self, execute_model_req: ExecuteModelRequest) -> bool: self, execute_model_req: ExecuteModelRequest) -> bool:
# When the batch size is too large, disable speculative decoding # When the batch size is too large, disable speculative decoding
# to stop trading off throughput for latency. # to stop trading off throughput for latency.
return (execute_model_req.running_queue_size >= return (execute_model_req.running_queue_size
self.disable_by_batch_size) >= self.disable_by_batch_size)
def _maybe_disable_speculative_tokens( def _maybe_disable_speculative_tokens(
self, disable_all_speculation: bool, self, disable_all_speculation: bool,
......
...@@ -104,11 +104,11 @@ class Top1Proposer(SpeculativeProposer): ...@@ -104,11 +104,11 @@ class Top1Proposer(SpeculativeProposer):
sampler_transposed=transposed, sampler_transposed=transposed,
) )
proposals = SpeculativeProposals( proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens,
proposal_token_ids=proposal_tokens, proposal_probs=proposal_probs,
proposal_probs=proposal_probs, proposal_lens=proposal_lens,
proposal_lens=proposal_lens, no_proposals=maybe_sampler_output
no_proposals=maybe_sampler_output is None) is None)
return proposals return proposals
def _split_by_proposal_len( def _split_by_proposal_len(
......
...@@ -40,13 +40,15 @@ def get_sampled_token_logprobs( ...@@ -40,13 +40,15 @@ def get_sampled_token_logprobs(
""" """
num_steps, batch_size, vocab_size = logprob_tensor.shape num_steps, batch_size, vocab_size = logprob_tensor.shape
selected_logprobs = logprob_tensor[torch.arange(num_steps).unsqueeze(1), selected_logprobs = logprob_tensor[
torch.arange(batch_size), torch.arange(num_steps).unsqueeze(1),
sampled_token_ids, ] torch.arange(batch_size),
sampled_token_ids,
]
expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand( expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand(
-1, -1, vocab_size) -1, -1, vocab_size)
sampled_token_ids_ranks = (logprob_tensor > sampled_token_ids_ranks = (logprob_tensor
expanded_selected_logprobs).sum(-1).add_(1) > expanded_selected_logprobs).sum(-1).add_(1)
return sampled_token_ids_ranks, selected_logprobs return sampled_token_ids_ranks, selected_logprobs
......
...@@ -182,8 +182,8 @@ class NemotronConfig(PretrainedConfig): ...@@ -182,8 +182,8 @@ class NemotronConfig(PretrainedConfig):
if self.rope_scaling is None: if self.rope_scaling is None:
return return
if not isinstance(self.rope_scaling, if not isinstance(self.rope_scaling, dict) or len(
dict) or len(self.rope_scaling) != 2: self.rope_scaling) != 2:
raise ValueError( raise ValueError(
"`rope_scaling` must be a dictionary with two fields, " "`rope_scaling` must be a dictionary with two fields, "
f"`type` and `factor`, got {self.rope_scaling}") f"`type` and `factor`, got {self.rope_scaling}")
......
...@@ -29,7 +29,7 @@ from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task ...@@ -29,7 +29,7 @@ from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
from collections import OrderedDict, UserDict, defaultdict from collections import OrderedDict, UserDict, defaultdict
from collections.abc import Hashable, Iterable, Mapping from collections.abc import Hashable, Iterable, Mapping
from dataclasses import dataclass, field from dataclasses import dataclass, field
from functools import lru_cache, partial, wraps from functools import cache, lru_cache, partial, wraps
from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
Dict, Generator, Generic, Iterator, List, Literal, Dict, Generator, Generic, Iterator, List, Literal,
NamedTuple, Optional, Tuple, Type, TypeVar, Union, NamedTuple, Optional, Tuple, Type, TypeVar, Union,
...@@ -352,7 +352,7 @@ class PyObjectCache: ...@@ -352,7 +352,7 @@ class PyObjectCache:
self._index = 0 self._index = 0
@lru_cache(maxsize=None) @cache
def get_max_shared_memory_bytes(gpu: int = 0) -> int: def get_max_shared_memory_bytes(gpu: int = 0) -> int:
"""Returns the maximum shared memory per thread block in bytes.""" """Returns the maximum shared memory per thread block in bytes."""
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
...@@ -697,7 +697,7 @@ def create_kv_caches_with_random( ...@@ -697,7 +697,7 @@ def create_kv_caches_with_random(
return key_caches, value_caches return key_caches, value_caches
@lru_cache(maxsize=None) @cache
def is_pin_memory_available() -> bool: def is_pin_memory_available() -> bool:
from vllm.platforms import current_platform from vllm.platforms import current_platform
return current_platform.is_pin_memory_available() return current_platform.is_pin_memory_available()
...@@ -886,7 +886,7 @@ def init_cached_hf_modules() -> None: ...@@ -886,7 +886,7 @@ def init_cached_hf_modules() -> None:
init_hf_modules() init_hf_modules()
@lru_cache(maxsize=None) @cache
def find_library(lib_name: str) -> str: def find_library(lib_name: str) -> str:
""" """
Find the library file in the system. Find the library file in the system.
...@@ -1607,7 +1607,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): ...@@ -1607,7 +1607,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
return module return module
@lru_cache(maxsize=None) @cache
def get_vllm_optional_dependencies(): def get_vllm_optional_dependencies():
metadata = importlib.metadata.metadata("vllm") metadata = importlib.metadata.metadata("vllm")
requirements = metadata.get_all("Requires-Dist", []) requirements = metadata.get_all("Requires-Dist", [])
......
...@@ -247,8 +247,8 @@ class Scheduler: ...@@ -247,8 +247,8 @@ class Scheduler:
token_budget -= num_new_tokens token_budget -= num_new_tokens
request.status = RequestStatus.RUNNING request.status = RequestStatus.RUNNING
request.num_computed_tokens = num_computed_tokens request.num_computed_tokens = num_computed_tokens
has_partial_request = (num_computed_tokens + num_new_tokens < has_partial_request = (num_computed_tokens + num_new_tokens
request.num_tokens) < request.num_tokens)
# Encoder-related. # Encoder-related.
if encoder_inputs_to_schedule: if encoder_inputs_to_schedule:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment