Unverified commit 2f1c19b2, authored by Ning Xie, committed by GitHub
Browse files

[CI] change spell checker from codespell to typos (#18711)


Signed-off-by: Andy Xie <andy.xning@gmail.com>
parent 42f52cc9
...@@ -219,7 +219,7 @@ def per_token_group_quant_int8( ...@@ -219,7 +219,7 @@ def per_token_group_quant_int8(
quantized tensor along with the scaling factor used for quantization. quantized tensor along with the scaling factor used for quantization.
Args: Args:
x: The input tenosr with ndim >= 2. x: The input tensor with ndim >= 2.
group_size: The group size used for quantization. group_size: The group size used for quantization.
eps: The minimum to avoid dividing zero. eps: The minimum to avoid dividing zero.
dtype: The dype of output tensor. Note that only `torch.int8` dtype: The dype of output tensor. Note that only `torch.int8`
......
...@@ -401,7 +401,7 @@ class BitsAndBytesModelLoader(BaseModelLoader): ...@@ -401,7 +401,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
self.target_modules.append( self.target_modules.append(
name.replace(rep_name, sub_name)) name.replace(rep_name, sub_name))
# Add original module name even if the module has stacked map, # Add original module name even if the module has stacked map,
# in case model has a mixture of disk-merged and disk-splitted # in case model has a mixture of disk-merged and disk-split
# weights with same last name. # weights with same last name.
self.target_modules.append(name) self.target_modules.append(name)
......
...@@ -131,7 +131,7 @@ class BaiChuanAttention(nn.Module): ...@@ -131,7 +131,7 @@ class BaiChuanAttention(nn.Module):
self.num_heads = (self.total_num_heads // self.num_heads = (self.total_num_heads //
tensor_model_parallel_world_size) tensor_model_parallel_world_size)
self.head_dim = hidden_size // self.total_num_heads self.head_dim = hidden_size // self.total_num_heads
self.postion_embedding = position_embedding self.position_embedding = position_embedding
self.rope_theta = rope_theta self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
...@@ -151,7 +151,7 @@ class BaiChuanAttention(nn.Module): ...@@ -151,7 +151,7 @@ class BaiChuanAttention(nn.Module):
quant_config=quant_config, quant_config=quant_config,
) )
# Create the alibi slopes and slice them. # Create the alibi slopes and slice them.
if self.postion_embedding == "ALIBI": if self.position_embedding == "ALIBI":
tp_rank = get_tensor_model_parallel_rank() tp_rank = get_tensor_model_parallel_rank()
head_start = tp_rank * self.num_heads head_start = tp_rank * self.num_heads
head_end = (tp_rank + 1) * self.num_heads head_end = (tp_rank + 1) * self.num_heads
...@@ -187,7 +187,7 @@ class BaiChuanAttention(nn.Module): ...@@ -187,7 +187,7 @@ class BaiChuanAttention(nn.Module):
) -> torch.Tensor: ) -> torch.Tensor:
qkv, _ = self.W_pack(hidden_states) qkv, _ = self.W_pack(hidden_states)
q, k, v = qkv.chunk(chunks=3, dim=-1) q, k, v = qkv.chunk(chunks=3, dim=-1)
if self.postion_embedding != "ALIBI": if self.position_embedding != "ALIBI":
q, k = self.rotary_emb(positions, q, k) q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v) attn_output = self.attn(q, k, v)
output, _ = self.o_proj(attn_output) output, _ = self.o_proj(attn_output)
......
...@@ -344,7 +344,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -344,7 +344,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self.image_newline = nn.Parameter( self.image_newline = nn.Parameter(
torch.randn(self.projector_config.n_embed) * embed_std) torch.randn(self.projector_config.n_embed) * embed_std)
# This is a typo in original implementation # This is a typo in original implementation
self.view_seperator = nn.Parameter( self.view_separator = nn.Parameter(
torch.randn(self.projector_config.n_embed) * embed_std) torch.randn(self.projector_config.n_embed) * embed_std)
else: else:
raise ValueError( raise ValueError(
...@@ -549,13 +549,13 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -549,13 +549,13 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
if self.global_view_pos == "head": if self.global_view_pos == "head":
global_local_features = torch.cat([ global_local_features = torch.cat([
global_features, global_features,
self.view_seperator[None, :], self.view_separator[None, :],
local_features, local_features,
]) ])
else: else:
global_local_features = torch.cat([ global_local_features = torch.cat([
local_features, local_features,
self.view_seperator[None, :], self.view_separator[None, :],
global_features, global_features,
]) ])
......
...@@ -197,7 +197,7 @@ class EAGLE(nn.Module): ...@@ -197,7 +197,7 @@ class EAGLE(nn.Module):
return logits return logits
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
# This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B # This implementation is incompatible with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
# due to missing lm_head weights and its config being that of a # due to missing lm_head weights and its config being that of a
# Llama model. Here's a compatible version with the same weights: # Llama model. Here's a compatible version with the same weights:
# https://huggingface.co/abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm # https://huggingface.co/abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm
......
...@@ -634,13 +634,13 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, ...@@ -634,13 +634,13 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
kwargs["has_images"] = True kwargs["has_images"] = True
# NOTE(woosuk): Here, we distinguish the sequences by the position id 0. # NOTE(woosuk): Here, we distinguish the sequences by the position id 0.
# This is a HACK. Fix this. # This is a HACK. Fix this.
start_idices = (positions == 0).cpu().nonzero() start_indices = (positions == 0).cpu().nonzero()
num_seqs = len(start_idices) num_seqs = len(start_indices)
seq_lens = [] seq_lens = []
for i in range(num_seqs): for i in range(num_seqs):
start_idx = start_idices[i].item() start_idx = start_indices[i].item()
if i < num_seqs - 1: if i < num_seqs - 1:
end_idx = start_idices[i + 1].item() end_idx = start_indices[i + 1].item()
else: else:
end_idx = len(input_ids) end_idx = len(input_ids)
seq_lens.append(end_idx - start_idx) seq_lens.append(end_idx - start_idx)
......
...@@ -52,7 +52,7 @@ class Llama4MoE(nn.Module): ...@@ -52,7 +52,7 @@ class Llama4MoE(nn.Module):
renormalize: bool, renormalize: bool,
) -> tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
router_scores, router_indices = fast_topk(gating_output, topk, dim=-1) router_scores, router_indices = fast_topk(gating_output, topk, dim=-1)
# psuedo-standard is that the router scores are floats # pseudo-standard is that the router scores are floats
router_scores = torch.sigmoid(router_scores.float()) router_scores = torch.sigmoid(router_scores.float())
return (router_scores, router_indices.to(torch.int32)) return (router_scores, router_indices.to(torch.int32))
......
...@@ -114,9 +114,9 @@ class MixtralMoE(nn.Module): ...@@ -114,9 +114,9 @@ class MixtralMoE(nn.Module):
f"Tensor parallel size {self.tp_size} is greater than " f"Tensor parallel size {self.tp_size} is greater than "
f"the number of experts {self.num_total_experts}.") f"the number of experts {self.num_total_experts}.")
# Split experts equally between ranks # Split experts equally between ranks
self.expert_indicies = np.array_split(range( self.expert_indices = np.array_split(range(self.num_total_experts),
self.num_total_experts), self.tp_size)[self.rank].tolist() self.tp_size)[self.rank].tolist()
if not self.expert_indicies: if not self.expert_indices:
raise ValueError( raise ValueError(
f"Rank {self.rank} has no experts assigned to it.") f"Rank {self.rank} has no experts assigned to it.")
...@@ -125,7 +125,7 @@ class MixtralMoE(nn.Module): ...@@ -125,7 +125,7 @@ class MixtralMoE(nn.Module):
config.hidden_size, config.hidden_size,
config.intermediate_size, config.intermediate_size,
quant_config=quant_config) quant_config=quant_config)
if idx in self.expert_indicies else None if idx in self.expert_indices else None
for idx in range(self.num_total_experts) for idx in range(self.num_total_experts)
]) ])
self.gate = ReplicatedLinear(config.hidden_size, self.gate = ReplicatedLinear(config.hidden_size,
...@@ -146,7 +146,7 @@ class MixtralMoE(nn.Module): ...@@ -146,7 +146,7 @@ class MixtralMoE(nn.Module):
routing_weights /= routing_weights.sum(dim=-1, keepdim=True) routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
final_hidden_states = None final_hidden_states = None
for expert_idx in self.expert_indicies: for expert_idx in self.expert_indices:
expert_layer = self.experts[expert_idx] expert_layer = self.experts[expert_idx]
expert_mask = (selected_experts == expert_idx) expert_mask = (selected_experts == expert_idx)
expert_weights = (routing_weights * expert_mask).sum(dim=-1, expert_weights = (routing_weights * expert_mask).sum(dim=-1,
......
...@@ -283,7 +283,7 @@ class OvisProcessingInfo(BaseProcessingInfo): ...@@ -283,7 +283,7 @@ class OvisProcessingInfo(BaseProcessingInfo):
def get_image_size_with_most_features(self) -> ImageSize: def get_image_size_with_most_features(self) -> ImageSize:
height, width = self.get_hf_processor().get_image_size() height, width = self.get_hf_processor().get_image_size()
hs = self.get_hf_config().visual_tokenizer_config.hidden_stride hs = self.get_hf_config().visual_tokenizer_config.hidden_stride
# NOTE(Isotr0py): 9 is `max_partion` hardcoded in original code # NOTE(Isotr0py): 9 is `max_partition` hardcoded in original code
# https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/modeling_ovis.py#L96 # https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/modeling_ovis.py#L96
return ImageSize(width=width * hs * 9, height=height * hs * 9) return ImageSize(width=width * hs * 9, height=height * hs * 9)
......
...@@ -145,7 +145,7 @@ class Phi3SmallSelfAttention(nn.Module): ...@@ -145,7 +145,7 @@ class Phi3SmallSelfAttention(nn.Module):
self.num_q_per_kv = self.num_heads // self.num_key_value_heads self.num_q_per_kv = self.num_heads // self.num_key_value_heads
if self.tp_size > 1: if self.tp_size > 1:
assert self.num_key_value_heads % self.tp_size == 0 assert self.num_key_value_heads % self.tp_size == 0
self.num_kv_heads_per_partion = max( self.num_kv_heads_per_partition = max(
1, self.num_key_value_heads // self.tp_size) 1, self.num_key_value_heads // self.tp_size)
self.num_heads_per_partition = self.num_heads // self.tp_size self.num_heads_per_partition = self.num_heads // self.tp_size
...@@ -212,7 +212,7 @@ class Phi3SmallSelfAttention(nn.Module): ...@@ -212,7 +212,7 @@ class Phi3SmallSelfAttention(nn.Module):
bs_params = { bs_params = {
'max_seqlen': self.max_position_embeddings, 'max_seqlen': self.max_position_embeddings,
'num_heads': self.num_heads_per_partition, 'num_heads': self.num_heads_per_partition,
"num_kv_heads": self.num_kv_heads_per_partion, "num_kv_heads": self.num_kv_heads_per_partition,
"block_size": self.sparse_block_size, "block_size": self.sparse_block_size,
"local_blocks": self.local_blocks, "local_blocks": self.local_blocks,
"vert_stride": self.vert_stride, "vert_stride": self.vert_stride,
...@@ -222,7 +222,7 @@ class Phi3SmallSelfAttention(nn.Module): ...@@ -222,7 +222,7 @@ class Phi3SmallSelfAttention(nn.Module):
self.attn = Attention(self.num_heads_per_partition, self.attn = Attention(self.num_heads_per_partition,
self.head_dim, self.head_dim,
self.scale, self.scale,
num_kv_heads=self.num_kv_heads_per_partion, num_kv_heads=self.num_kv_heads_per_partition,
cache_config=cache_config, cache_config=cache_config,
quant_config=quant_config, quant_config=quant_config,
blocksparse_params=bs_params, blocksparse_params=bs_params,
...@@ -243,8 +243,8 @@ class Phi3SmallSelfAttention(nn.Module): ...@@ -243,8 +243,8 @@ class Phi3SmallSelfAttention(nn.Module):
# NOTE: this is required by RotaryEmbed, which indeed does not have to # NOTE: this is required by RotaryEmbed, which indeed does not have to
# TODO: allow 3D QK for rotary forward # TODO: allow 3D QK for rotary forward
q = q.reshape(-1, self.head_dim * self.num_heads_per_partition) q = q.reshape(-1, self.head_dim * self.num_heads_per_partition)
k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partion) k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)
v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partion) v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)
q, k = self.rotary_emb(positions, q, k) q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v) attn_output = self.attn(q, k, v)
......
...@@ -126,7 +126,7 @@ class ConformerEncoderLayer(nn.Module): ...@@ -126,7 +126,7 @@ class ConformerEncoderLayer(nn.Module):
(Multi-Head Attention), (Multi-Head Attention),
1 = typical Multi-Head Attention, 1 = typical Multi-Head Attention,
1 < attn_group_sizes < attention_heads = Grouped-Query Attention 1 < attn_group_sizes < attention_heads = Grouped-Query Attention
attn_group_sizes = attenion_heads = Multi-Query Attention attn_group_sizes = attention_heads = Multi-Query Attention
""" """
def __init__( def __init__(
...@@ -318,7 +318,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module): ...@@ -318,7 +318,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
1 = typical Multi-Head Attention, 1 = typical Multi-Head Attention,
1 < attention_group_size < attention_heads = Grouped-Query 1 < attention_group_size < attention_heads = Grouped-Query
Attention Attention
attention_group_size = attenion_heads = Multi-Query Attention attention_group_size = attention_heads = Multi-Query Attention
""" """
def __init__( def __init__(
...@@ -744,7 +744,7 @@ class ConformerEncoder(TransformerEncoderBase): ...@@ -744,7 +744,7 @@ class ConformerEncoder(TransformerEncoderBase):
1 = typical Multi-Head Attention, 1 = typical Multi-Head Attention,
1 < attention_group_size < attention_heads = Grouped-Query 1 < attention_group_size < attention_heads = Grouped-Query
Attention Attention
attention_group_size = attenion_heads = Multi-Query Attention attention_group_size = attention_heads = Multi-Query Attention
""" """
extra_multi_layer_output_idxs: list[int] extra_multi_layer_output_idxs: list[int]
......
...@@ -147,15 +147,15 @@ class mp(torch.autograd.Function): ...@@ -147,15 +147,15 @@ class mp(torch.autograd.Function):
grad_at_output = grad_at_output * multiplier grad_at_output = grad_at_output * multiplier
grad_at_scores_expaned = masked_gates * grad_at_output.mul(-1) grad_at_scores_expanded = masked_gates * grad_at_output.mul(-1)
grad_at_scores_expaned.scatter_add_( grad_at_scores_expanded.scatter_add_(
dim=-1, dim=-1,
index=selected_experts, index=selected_experts,
src=grad_at_output, src=grad_at_output,
) )
return ( return (
grad_at_scores_expaned, grad_at_scores_expanded,
None, None,
None, None,
None, None,
......
...@@ -324,7 +324,7 @@ def merge_and_sort_multimodal_metadata( ...@@ -324,7 +324,7 @@ def merge_and_sort_multimodal_metadata(
Returns: Returns:
list[str]: List of item modalities in order of their positions in the list[str]: List of item modalities in order of their positions in the
input sequence. input sequence.
list[PlaceholderRange]: Sorted list of all PlaceholdeRanges from list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
mm_positions. mm_positions.
Optional[list[str]]: Sorted list of all hashes from mm_hashes if given, Optional[list[str]]: Sorted list of all hashes from mm_hashes if given,
None otherwise. None otherwise.
......
...@@ -68,7 +68,7 @@ class OvisProcessor(ProcessorMixin): ...@@ -68,7 +68,7 @@ class OvisProcessor(ProcessorMixin):
""" """
attributes = ["image_processor", "tokenizer"] attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["chat_template", "image_pad_token", "image_segement_len"] valid_kwargs = ["chat_template", "image_pad_token", "image_segment_len"]
image_processor_class = "AutoImageProcessor" image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer" tokenizer_class = "AutoTokenizer"
......
...@@ -886,7 +886,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): ...@@ -886,7 +886,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
num_decode_tokens=0, num_decode_tokens=0,
slot_mapping=slot_mapping, slot_mapping=slot_mapping,
multi_modal_placeholder_index_maps= multi_modal_placeholder_index_maps=
None, # FIXME(kzawora): mutli-modality will not work here None, # FIXME(kzawora): multi-modality will not work here
enable_kv_scales_calculation=False, enable_kv_scales_calculation=False,
) )
multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
......
...@@ -277,7 +277,7 @@ class StatefulModelInput(BroadcastableModelInput): ...@@ -277,7 +277,7 @@ class StatefulModelInput(BroadcastableModelInput):
assert fmi.input_tokens.shape[0] >= self.num_seqs assert fmi.input_tokens.shape[0] >= self.num_seqs
fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs] fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs]
# Update frozen_model_input::input_positons. # Update frozen_model_input::input_positions.
assert fmi.input_positions is not None assert fmi.input_positions is not None
assert fmi.input_positions.shape[0] >= self.num_seqs assert fmi.input_positions.shape[0] >= self.num_seqs
fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self. fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self.
......
...@@ -798,9 +798,9 @@ class ModelWrapper(nn.Module): ...@@ -798,9 +798,9 @@ class ModelWrapper(nn.Module):
""" """
batch_size, seq_len = token_ids.shape batch_size, seq_len = token_ids.shape
# Calculate the positions to sample from. # Calculate the positions to sample from.
start_indicies = torch.arange( start_indices = torch.arange(
batch_size, dtype=torch.int32, device=input_lens.device) * seq_len batch_size, dtype=torch.int32, device=input_lens.device) * seq_len
logits_indices = start_indicies + input_lens - 1 logits_indices = start_indices + input_lens - 1
attn_metadata = get_forward_context().attn_metadata attn_metadata = get_forward_context().attn_metadata
# FIXME(woosuk): This is a temporary hack to avoid using the existing # FIXME(woosuk): This is a temporary hack to avoid using the existing
...@@ -822,14 +822,14 @@ class ModelWrapper(nn.Module): ...@@ -822,14 +822,14 @@ class ModelWrapper(nn.Module):
num_kv_heads, num_blocks, block_size, _ = kv_caches[0][0].shape num_kv_heads, num_blocks, block_size, _ = kv_caches[0][0].shape
slot_mapping = attn_metadata.slot_mapping slot_mapping = attn_metadata.slot_mapping
slot_mapping = slot_mapping.flatten() slot_mapping = slot_mapping.flatten()
head_indicies = torch.arange(0, head_indices = torch.arange(0,
num_kv_heads, num_kv_heads,
device=slot_mapping.device, device=slot_mapping.device,
dtype=slot_mapping.dtype) dtype=slot_mapping.dtype)
head_indicies *= block_size * num_blocks head_indices *= block_size * num_blocks
slot_mapping = slot_mapping.repeat_interleave(num_kv_heads).view( slot_mapping = slot_mapping.repeat_interleave(num_kv_heads).view(
-1, num_kv_heads) -1, num_kv_heads)
slot_mapping = slot_mapping + head_indicies.view(1, -1) slot_mapping = slot_mapping + head_indices.view(1, -1)
slot_mapping = slot_mapping.flatten() slot_mapping = slot_mapping.flatten()
attn_metadata.slot_mapping = slot_mapping attn_metadata.slot_mapping = slot_mapping
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment