Unverified Commit 823ab796 authored by Harry Mellor's avatar Harry Mellor Committed by GitHub
Browse files

Update `pre-commit` hooks (#12475)


Signed-off-by: default avatarHarry Mellor <19981378+hmellor@users.noreply.github.com>
parent 6116ca8c
......@@ -115,17 +115,17 @@ class VocabParallelEmbeddingShardIndices:
def __post_init__(self):
# sanity checks
assert (self.padded_org_vocab_start_index <=
self.padded_org_vocab_end_index)
assert (self.padded_added_vocab_start_index <=
self.padded_added_vocab_end_index)
assert (self.padded_org_vocab_start_index
<= self.padded_org_vocab_end_index)
assert (self.padded_added_vocab_start_index
<= self.padded_added_vocab_end_index)
assert self.org_vocab_start_index <= self.org_vocab_end_index
assert self.added_vocab_start_index <= self.added_vocab_end_index
assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
assert (self.added_vocab_start_index <=
self.padded_added_vocab_start_index)
assert (self.added_vocab_start_index
<= self.padded_added_vocab_start_index)
assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
assert self.added_vocab_end_index <= self.padded_added_vocab_end_index
......@@ -141,8 +141,8 @@ def get_masked_input_and_mask(
added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
# torch.compile will fuse all of the pointwise ops below
# into a single kernel, making it very fast
org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ <
org_vocab_end_index)
org_vocab_mask = (input_ >= org_vocab_start_index) & (
input_ < org_vocab_end_index)
added_vocab_mask = (input_ >= added_vocab_start_index) & (
input_ < added_vocab_end_index)
added_offset = added_vocab_start_index - (
......
......@@ -1121,8 +1121,9 @@ class BitsAndBytesModelLoader(BaseModelLoader):
# from being incorrectly identified as being present in
# 'vpm.encoder.layers.0.self_attn.qkv_proj.weight
shard_pos = quant_param_name.find(shard_name)
can_correct_rename = (shard_pos > 0) and (
quant_param_name[shard_pos - 1] == ".")
can_correct_rename = (shard_pos
> 0) and (quant_param_name[shard_pos - 1]
== ".")
# If the quant_param_name is packed, it won't occur in the
# param_dict before renaming.
new_quant_param_name = quant_param_name.replace(
......
......@@ -298,8 +298,8 @@ class TensorizerAgent:
to allow for adapter added tokens."""
for child in self.model.modules():
if (isinstance(child, VocabParallelEmbedding)
and child.weight.shape[0] <
child.num_embeddings_per_partition):
and child.weight.shape[0]
< child.num_embeddings_per_partition):
new_weight = torch.empty(child.num_embeddings_per_partition,
child.embedding_dim,
dtype=child.weight.dtype,
......
......@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Gemma model compatible with HuggingFace weights."""
from functools import lru_cache
from functools import cache
from typing import Iterable, List, Optional, Set, Tuple, Union
import torch
......@@ -48,7 +48,7 @@ from .utils import (is_pp_missing_parameter,
logger = init_logger(__name__)
@lru_cache(maxsize=None)
@cache
def _get_gemma_act_fn(
hidden_act: Optional[str],
hidden_activation: Optional[str],
......
......@@ -429,10 +429,10 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
for e in range(p.size(0)):
w1_name = n.replace(
'.block_sparse_moe.input_linear.weight',
".block_sparse_moe.experts.%d.w1.weight" % e)
f".block_sparse_moe.experts.{e}.w1.weight")
w3_name = n.replace(
'.block_sparse_moe.input_linear.weight',
".block_sparse_moe.experts.%d.w3.weight" % e)
f".block_sparse_moe.experts.{e}.w3.weight")
w1_param, w3_param = p[e].chunk(2, dim=0)
assert w1_name not in new_weights
assert w3_name not in new_weights
......@@ -442,7 +442,7 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
for e in range(p.size(0)):
w2_name = n.replace(
'.block_sparse_moe.output_linear.weight',
".block_sparse_moe.experts.%d.w2.weight" % e)
f".block_sparse_moe.experts.{e}.w2.weight")
w2_param = p[e]
assert w2_name not in new_weights
new_weights[w2_name] = w2_param
......
......@@ -1365,8 +1365,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
# For 1) text-only prefill and decode, 2) image-present decode.
if image_inputs is None:
full_text_row_masked_out_mask = (
attn_metadata.encoder_seq_lens_tensor != 0).reshape(-1, 1).to(
input_ids.device)
attn_metadata.encoder_seq_lens_tensor
!= 0).reshape(-1, 1).to(input_ids.device)
skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0
# For image-present prefill.
......
......@@ -81,8 +81,8 @@ class MLPSpeculator(nn.Module):
if self.tie_weights:
assert (
self.n_predict >
1), "You cannot tie weights between stages when only 1 exists"
self.n_predict > 1
), "You cannot tie weights between stages when only 1 exists"
embedding = VocabParallelEmbedding(
config.vocab_size,
self.inner_dim,
......
......@@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01):
# compute mask for sparsity
mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True)
factor = scores.abs().clamp(min=mask_logits_threshold)
mask_logits_threshold = (
(mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
mask_logits_threshold = ((mask_logits_threshold - scores) /
factor) > (2 * jitter_eps)
# apply mask
masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf"))
......@@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01):
mask_logits_threshold, max_ind = masked_scores.max(dim=-1,
keepdim=True)
factor = scores.abs().clamp(min=mask_logits_threshold)
mask_logits_threshold = (
(mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
mask_logits_threshold = ((mask_logits_threshold - scores) /
factor) > (2 * jitter_eps)
# apply mask
masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold,
......
......@@ -462,7 +462,8 @@ class _ModelRegistry:
ModelRegistry = _ModelRegistry({
model_arch: _LazyRegisteredModel(
model_arch:
_LazyRegisteredModel(
module_name=f"vllm.model_executor.models.{mod_relname}",
class_name=cls_name,
)
......
......@@ -333,10 +333,10 @@ class ModifiedWhisperEncoder(WhisperEncoder):
return hidden_states
@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor,
info=UltravoxProcessingInfo,
dummy_inputs=UltravoxDummyInputsBuilder
)
@MULTIMODAL_REGISTRY.register_processor(
UltravoxMultiModalProcessor,
info=UltravoxProcessingInfo,
dummy_inputs=UltravoxDummyInputsBuilder)
class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
hf_to_vllm_mapper = WeightsMapper(
......
......@@ -599,9 +599,8 @@ def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int):
device: torch.device,
) -> IntermediateTensors:
return IntermediateTensors({
key: torch.zeros((batch_size, hidden_size),
dtype=dtype,
device=device)
key:
torch.zeros((batch_size, hidden_size), dtype=dtype, device=device)
for key in keys
})
......
......@@ -166,7 +166,8 @@ class SamplingMetadata:
pin_memory=pin_memory,
)
categorized_sample_indices = {
t: async_tensor_h2d(
t:
async_tensor_h2d(
seq_ids,
dtype=torch.int,
target_device=device,
......@@ -198,8 +199,12 @@ def _prepare_seq_groups(
device: str,
generators: Optional[Dict[str, torch.Generator]] = None,
cache: Optional[SamplingMetadataCache] = None,
) -> Tuple[List[SequenceGroupToSample], List[int], Dict[SamplingType,
List[int]], int, ]:
) -> Tuple[
List[SequenceGroupToSample],
List[int],
Dict[SamplingType, List[int]],
int,
]:
"""Prepare sequence groups and indices for sampling.
Args:
......
......@@ -38,8 +38,8 @@ class NeuronPlatform(Platform):
if parallel_config.world_size > 1:
parallel_config.distributed_executor_backend = "uni"
assert (vllm_config.lora_config is
None), "LoRA is not supported for Neuron backend."
assert (vllm_config.lora_config
is None), "LoRA is not supported for Neuron backend."
assert (not vllm_config.speculative_config
), "Speculative decoding not yet supported for Neuron backend."
......
......@@ -121,8 +121,8 @@ class ScalarType:
min_raw = max_raw | sign_bit_double
return struct.unpack('!d', struct.pack('!Q', min_raw))[0]
else:
assert (not self.is_signed() or
self.size_bits <= 64), "Cannot represent min as a int64_t"
assert (not self.is_signed() or self.size_bits
<= 64), "Cannot represent min as a int64_t"
if self.is_signed():
return -(1 << (self.size_bits - 1))
......
......@@ -510,8 +510,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
self, execute_model_req: ExecuteModelRequest) -> bool:
# When the batch size is too large, disable speculative decoding
# to stop trading off throughput for latency.
return (execute_model_req.running_queue_size >=
self.disable_by_batch_size)
return (execute_model_req.running_queue_size
>= self.disable_by_batch_size)
def _maybe_disable_speculative_tokens(
self, disable_all_speculation: bool,
......
......@@ -104,11 +104,11 @@ class Top1Proposer(SpeculativeProposer):
sampler_transposed=transposed,
)
proposals = SpeculativeProposals(
proposal_token_ids=proposal_tokens,
proposal_probs=proposal_probs,
proposal_lens=proposal_lens,
no_proposals=maybe_sampler_output is None)
proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens,
proposal_probs=proposal_probs,
proposal_lens=proposal_lens,
no_proposals=maybe_sampler_output
is None)
return proposals
def _split_by_proposal_len(
......
......@@ -40,13 +40,15 @@ def get_sampled_token_logprobs(
"""
num_steps, batch_size, vocab_size = logprob_tensor.shape
selected_logprobs = logprob_tensor[torch.arange(num_steps).unsqueeze(1),
torch.arange(batch_size),
sampled_token_ids, ]
selected_logprobs = logprob_tensor[
torch.arange(num_steps).unsqueeze(1),
torch.arange(batch_size),
sampled_token_ids,
]
expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand(
-1, -1, vocab_size)
sampled_token_ids_ranks = (logprob_tensor >
expanded_selected_logprobs).sum(-1).add_(1)
sampled_token_ids_ranks = (logprob_tensor
> expanded_selected_logprobs).sum(-1).add_(1)
return sampled_token_ids_ranks, selected_logprobs
......
......@@ -182,8 +182,8 @@ class NemotronConfig(PretrainedConfig):
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling,
dict) or len(self.rope_scaling) != 2:
if not isinstance(self.rope_scaling, dict) or len(
self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with two fields, "
f"`type` and `factor`, got {self.rope_scaling}")
......
......@@ -29,7 +29,7 @@ from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
from collections import OrderedDict, UserDict, defaultdict
from collections.abc import Hashable, Iterable, Mapping
from dataclasses import dataclass, field
from functools import lru_cache, partial, wraps
from functools import cache, lru_cache, partial, wraps
from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
Dict, Generator, Generic, Iterator, List, Literal,
NamedTuple, Optional, Tuple, Type, TypeVar, Union,
......@@ -352,7 +352,7 @@ class PyObjectCache:
self._index = 0
@lru_cache(maxsize=None)
@cache
def get_max_shared_memory_bytes(gpu: int = 0) -> int:
"""Returns the maximum shared memory per thread block in bytes."""
from vllm import _custom_ops as ops
......@@ -697,7 +697,7 @@ def create_kv_caches_with_random(
return key_caches, value_caches
@lru_cache(maxsize=None)
@cache
def is_pin_memory_available() -> bool:
from vllm.platforms import current_platform
return current_platform.is_pin_memory_available()
......@@ -886,7 +886,7 @@ def init_cached_hf_modules() -> None:
init_hf_modules()
@lru_cache(maxsize=None)
@cache
def find_library(lib_name: str) -> str:
"""
Find the library file in the system.
......@@ -1607,7 +1607,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
return module
@lru_cache(maxsize=None)
@cache
def get_vllm_optional_dependencies():
metadata = importlib.metadata.metadata("vllm")
requirements = metadata.get_all("Requires-Dist", [])
......
......@@ -247,8 +247,8 @@ class Scheduler:
token_budget -= num_new_tokens
request.status = RequestStatus.RUNNING
request.num_computed_tokens = num_computed_tokens
has_partial_request = (num_computed_tokens + num_new_tokens <
request.num_tokens)
has_partial_request = (num_computed_tokens + num_new_tokens
< request.num_tokens)
# Encoder-related.
if encoder_inputs_to_schedule:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment