Unverified commit 6a895197, authored by Jiayi Yan, committed by GitHub
Browse files

[Bugfix][CI] fix typos (#34934)


Signed-off-by: 1195343015 <1195343015@qq.com>
Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8c760b6a
......@@ -916,7 +916,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
self, max_pixels: int | None = None
) -> ImageSize:
# NOTE: Simply processing a huge size with _get_vision_info might not give a
# size that maximizes the number of featrues, i.e., the number of (merged)
# size that maximizes the number of features, i.e., the number of (merged)
# patches. This is because the number of patches limits the allowed aspect
# ratios. For example, suppose the maximum number of patches is 1280. A square
# image cannot be broken down into 1280 patches, so feeding a giant square image
......
......@@ -459,14 +459,14 @@ class Step3VLProcessor:
image_inputs = {}
text_inputs = self.tokenizer(text)
else:
splitted_images_data = self._split_images(images)
split_images_data = self._split_images(images)
pixel_values_lst = []
patch_pixel_values_lst = []
patch_newline_mask_lst = []
image_repl_str_lst = []
image_repl_ids_lst = []
num_patches = []
for raw_img, img_patches, patch_newline_mask in splitted_images_data:
for raw_img, img_patches, patch_newline_mask in split_images_data:
pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))
if len(img_patches) > 0:
......
......@@ -353,7 +353,7 @@ class FusedMoEBlock(nn.Module):
if swiglu_limit not in (None, 0):
swiglu_limit = float(swiglu_limit)
assert swiglu_limit == 7.0, (
"Swiglu limit in fused moe block only suport 7.0 now."
"Swiglu limit in fused moe block only support 7.0 now."
)
activation = "swiglustep"
logger.debug(
......
......@@ -18,7 +18,7 @@ logger = init_logger(__name__)
class Ernie45ReasoningParser(BaseThinkingReasoningParser):
"""
Reasoning parser for Ernie45 thinking model.
The Ernie45 thinking model ouput format is
The Ernie45 thinking model output format is
abc\n</think>\n\n<response>\ndef\n</response>\n
or abc\n</think>\ndef
"""
......@@ -73,7 +73,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
Extract reasoning content from a delta message.
Handles streaming output where previous + delta = current.
Uses token IDs for faster processing.
The Ernie45 thinking model ouput format is
The Ernie45 thinking model output format is
abc\n</think>\n\n<response>\ndef\n</response>\n
or abc\n</think>\ndef
- 'abc' goes to reasoning
......@@ -148,7 +148,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
) -> tuple[str | None, str | None]:
"""
Extract reasoning content from the model output.
The Ernie45 thinking model ouput format is
The Ernie45 thinking model output format is
abc\n</think>\n\n\n<response>\ndef\n</response>\n
or abc\n</think>\ndef
- 'abc' goes to reasoning
......
......@@ -564,7 +564,7 @@ def replace_vision_chunk_video_placeholder(
mm_data: "MultiModalDataDict",
video_placeholder: str | None,
) -> str | list[int]:
# get video placehoder, replace it with runtime video-chunk prompts
# get video placeholder, replace it with runtime video-chunk prompts
if video_placeholder and isinstance(prompt_raw, str):
video_prompts = build_video_prompts_from_mm_data(mm_data)
......
"""
Schemas and utilites for preprocessing inputs.
Schemas and utilities for preprocessing inputs.
"""
# SPDX-License-Identifier: Apache-2.0
......
"""
Schemas and utilites for tokenization inputs.
Schemas and utilities for tokenization inputs.
"""
# SPDX-License-Identifier: Apache-2.0
......
......@@ -169,7 +169,7 @@ def _prepare_apply_chat_template_tools_and_messages(
tool.pop(tool_key)
logger.warning_once(
f"'{tool_key}' is not supported by mistral-common for tools. "
"It has been poped from the tool definition."
"It has been popped from the tool definition."
)
if tool["type"] == "function":
function_keys = list(tool["function"].keys())
......@@ -178,7 +178,7 @@ def _prepare_apply_chat_template_tools_and_messages(
tool["function"].pop(function_key)
logger.warning_once(
f"'{function_key}' is not supported by mistral-common "
"for function tools. It has been poped from the "
"for function tools. It has been popped from the "
"function definition."
)
else:
......
......@@ -402,7 +402,7 @@ class Ovis2_5Processor(ProcessorMixin):
images = [images]
elif video is not None:
is_video = True
# type of vidoe in dummy_mm_data is np.ndarray
# type of video in dummy_mm_data is np.ndarray
if isinstance(video, np.ndarray):
images = []
for i in range(video.shape[0]):
......
......@@ -174,7 +174,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
query_start_loc = query_start_loc[: num_decodes + 1]
block_table_tensor = block_table_tensor[:num_decodes]
sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
scheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
num_reqs=num_reqs,
num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads,
......@@ -197,7 +197,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
seq_lens=seq_lens,
block_table=block_table_tensor,
slot_mapping=slot_mapping,
scheduler_metadata=sheduler_metadata,
scheduler_metadata=scheduler_metadata,
causal=causal,
use_sdpa_prefill=self.use_sdpa_prefill,
num_decode_tokens=num_decode_tokens,
......
......@@ -383,7 +383,7 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
# Return a tensor of shape (#requests, #max blocks)
state_indices_tensor = common_attn_metadata.block_table_tensor
# Additional cache-related varaiables:
# Additional cache-related variables:
mamba_block_size = self.kv_cache_spec.block_size
(
block_idx_last_computed_token,
......
......@@ -49,14 +49,14 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
# For FP8 sparse attention we have two impelementations:
# For FP8 sparse attention we have two implementations:
# 1. Mixed batch mode: use the FP8 decode kernel for both prefill and decode; this is
# done by treating all tokens as a single batch.
# 2. Separate prefill and decode mode: use the BF16 prefill kernel for prefill
# (upconverting the FP8 cache to BF16 then calling the prefill kernel) and using
# the FP8 decode kernel for decode.
# Currently we use #1 when the number of heads per rank is low (i.e. TP) since the BF16
# prefill kernel requires padding the numer of heads to 128 while the decode does not
# prefill kernel requires padding the number of heads to 128 while the decode does not
# so when the per rank head count is below MIN_HEADS_FOR_BF16_PREFILL we use the mixed
# batch mode (#1).
MIN_HEADS_FOR_BF16_PREFILL = 32
......@@ -126,7 +126,7 @@ class FlashMLASparseBackend(AttentionBackend):
cache_dtype_str: str = "auto",
) -> tuple[int, ...]:
if cache_dtype_str == "fp8_ds_mla":
# custom storage fromat is 656 bytes
# custom storage format is 656 bytes
# see FlashMLA readme.md for details
return (num_blocks, block_size, 656)
else:
......
......@@ -370,7 +370,7 @@ class AiterFlashAttentionMetadata:
slot_mapping: torch.Tensor
block_table: torch.Tensor
# prefill and deocde split
# prefill and decode split
num_decodes: int
num_decode_tokens: int
num_prefills: int
......@@ -1099,7 +1099,7 @@ class AiterFlashAttentionImpl(AttentionImpl):
extend_tokens_slice = slice(
num_decode_tokens, num_decode_tokens + num_extend_tokens
)
extend_querys = query[extend_tokens_slice]
extend_queries = query[extend_tokens_slice]
extend_keys = key[extend_tokens_slice]
extend_values = value[extend_tokens_slice]
extend_outputs = output[extend_tokens_slice]
......@@ -1110,7 +1110,7 @@ class AiterFlashAttentionImpl(AttentionImpl):
v_scale = attn_metadata.v_scale
self.extend_forward(
attn_metadata=attn_metadata,
query=extend_querys,
query=extend_queries,
key=extend_keys,
value=extend_values,
key_cache=key_cache,
......
......@@ -863,7 +863,7 @@ class MambaManager(SingleTypeKVCacheManager):
):
# Mamba can't rely on blocks generated by other requests in the current step
# To put it in the next step, we return num_gpu_blocks + 1 so
# that kv_cache_manager will think there is no enough blocks to allocte now
# that kv_cache_manager will think there are not enough blocks to allocate now
# and don't schedule it in the current step.
return self.block_pool.num_gpu_blocks + 1
if self.mamba_cache_mode != "align":
......
......@@ -1724,11 +1724,11 @@ class DPEngineCoreProc(EngineCoreProc):
"""
Send notifications to EngineCoreClient, which can then forward
the notifications to other engine core processes. It is used for:
1) In scale up: new core engines to notify exisiting core engines
1) In scale up: new core engines to notify existing core engines
that they are ready;
2) In scale down: removing core engines to notify EngineCoreClient
so EngineCoreClient can release their ray placement groups;
3) Both scale up/down: to notify EngineCoreClient that exisiting
3) Both scale up/down: to notify EngineCoreClient that existing
core engines have already switched to the new parallel setup.
"""
if vllm_config is None:
......
......@@ -194,7 +194,7 @@ class InputProcessor:
@staticmethod
def assign_request_id(request: EngineCoreRequest):
"""Replace the externally supplied request ID with an internal request ID
that adds 8 random characters in order to ensure uniquness.
that adds 8 random characters in order to ensure uniqueness.
"""
if request.external_req_id is not None:
raise ValueError(
......
......@@ -197,7 +197,7 @@ class SingleDirectionOffloadingHandler(OffloadingHandler):
transfer = self._transfers.popleft()
transfer_time = (
transfer.start_event.elapsed_time(transfer.end_event) * 1e-3
) # elapsed_time is in miliseconds
) # elapsed_time is in milliseconds
result = TransferResult(
job_id=transfer.job_id,
success=True,
......
......@@ -905,7 +905,7 @@ class GPUModelRunner(
Args:
scheduler_output: The scheduler output.
"""
# Attention free models have zero kv_cache_goups, however models
# Attention free models have zero kv_cache_groups, however models
# like Mamba are also attention free but use the kv_cache for
# keeping its internal state. This is why we check the number
# of kv_cache groups instead of solely checking
......@@ -1065,7 +1065,7 @@ class GPUModelRunner(
# of the request. for example:
# first step: num_computed_tokens = 0, spec_tokens = [],
# prev_num_draft_len = 0.
# second step: num_computed_tokens = 100(prompt lenth),
# second step: num_computed_tokens = 100(prompt length),
# spec_tokens = [a,b], prev_num_draft_len = 0.
# third step: num_computed_tokens = 100 + 2, spec_tokens = [c,d],
# prev_num_draft_len = 2.
......@@ -1412,30 +1412,30 @@ class GPUModelRunner(
prev_draft_token_indices.extend(range(start, start + draft_len))
indices_match &= prev_index == flattened_index
max_flattened_index = max(max_flattened_index, flattened_index)
num_commmon_tokens = len(sample_flattened_indices)
num_common_tokens = len(sample_flattened_indices)
total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens
if num_commmon_tokens < total_without_spec:
if num_common_tokens < total_without_spec:
# If not all requests are decodes from the last iteration,
# We need to copy the input_ids_cpu to the GPU first.
self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
if self.enable_prompt_embeds:
self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
if num_commmon_tokens == 0:
if num_common_tokens == 0:
# No requests in common with the previous iteration
# So input_ids.cpu will have all the input ids.
return
if indices_match and max_flattened_index == (num_commmon_tokens - 1):
if indices_match and max_flattened_index == (num_common_tokens - 1):
# Common-case optimization: the batch is unchanged
# and no reordering happened.
# The indices are both the same permutation of 0..N-1 so
# we can copy directly using a single slice.
self.input_ids.gpu[:num_commmon_tokens].copy_(
self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0],
self.input_ids.gpu[:num_common_tokens].copy_(
self.input_batch.prev_sampled_token_ids[:num_common_tokens, 0],
non_blocking=True,
)
if self.enable_prompt_embeds:
self.is_token_ids.gpu[:num_commmon_tokens] = True
self.is_token_ids.gpu[:num_common_tokens] = True
return
# Upload the index tensors asynchronously so the scatter can be non-blocking.
sampled_tokens_index_tensor = torch.tensor(
......@@ -4383,7 +4383,7 @@ class GPUModelRunner(
self.model.compile(fullgraph=True, backend=backend)
return
# for other compilation modes, cudagraph behavior is controlled by
# CudagraphWraper and CudagraphDispatcher of vllm.
# CudagraphWrapper and CudagraphDispatcher of vllm.
# wrap the model with full cudagraph wrapper if needed.
cudagraph_mode = self.compilation_config.cudagraph_mode
......@@ -4444,7 +4444,7 @@ class GPUModelRunner(
:param weights_path: path to load weights from if weights_iterator is not
provided. Use path of original model if neither is provided.
:param is_checkpoint_format: set to False if weights have already been processed
into kernel format (repacking, renaming, ect.)
into kernel format (repacking, renaming, etc.)
"""
# TODO(@kylesayrs): generalize to all runners and loaders
# argument validation
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment