Unverified Commit 6a895197 authored by Jiayi Yan's avatar Jiayi Yan Committed by GitHub
Browse files

[Bugfix][CI] fix typos (#34934)


Signed-off-by: 1195343015 <1195343015@qq.com>
Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8c760b6a
...@@ -916,7 +916,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): ...@@ -916,7 +916,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
self, max_pixels: int | None = None self, max_pixels: int | None = None
) -> ImageSize: ) -> ImageSize:
# NOTE: Simply processing a huge size with _get_vision_info might not give a # NOTE: Simply processing a huge size with _get_vision_info might not give a
# size that maximizes the number of featrues, i.e., the number of (merged) # size that maximizes the number of features, i.e., the number of (merged)
# patches. This is because the number of patches limits the allowed aspect # patches. This is because the number of patches limits the allowed aspect
# ratios. For example, suppose the maximum number of patches is 1280. A square # ratios. For example, suppose the maximum number of patches is 1280. A square
# image cannot be broken down into 1280 patches, so feeding a giant square image # image cannot be broken down into 1280 patches, so feeding a giant square image
......
...@@ -459,14 +459,14 @@ class Step3VLProcessor: ...@@ -459,14 +459,14 @@ class Step3VLProcessor:
image_inputs = {} image_inputs = {}
text_inputs = self.tokenizer(text) text_inputs = self.tokenizer(text)
else: else:
splitted_images_data = self._split_images(images) split_images_data = self._split_images(images)
pixel_values_lst = [] pixel_values_lst = []
patch_pixel_values_lst = [] patch_pixel_values_lst = []
patch_newline_mask_lst = [] patch_newline_mask_lst = []
image_repl_str_lst = [] image_repl_str_lst = []
image_repl_ids_lst = [] image_repl_ids_lst = []
num_patches = [] num_patches = []
for raw_img, img_patches, patch_newline_mask in splitted_images_data: for raw_img, img_patches, patch_newline_mask in split_images_data:
pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img])) pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))
if len(img_patches) > 0: if len(img_patches) > 0:
......
...@@ -353,7 +353,7 @@ class FusedMoEBlock(nn.Module): ...@@ -353,7 +353,7 @@ class FusedMoEBlock(nn.Module):
if swiglu_limit not in (None, 0): if swiglu_limit not in (None, 0):
swiglu_limit = float(swiglu_limit) swiglu_limit = float(swiglu_limit)
assert swiglu_limit == 7.0, ( assert swiglu_limit == 7.0, (
"Swiglu limit in fused moe block only suport 7.0 now." "Swiglu limit in fused moe block only support 7.0 now."
) )
activation = "swiglustep" activation = "swiglustep"
logger.debug( logger.debug(
......
...@@ -18,7 +18,7 @@ logger = init_logger(__name__) ...@@ -18,7 +18,7 @@ logger = init_logger(__name__)
class Ernie45ReasoningParser(BaseThinkingReasoningParser): class Ernie45ReasoningParser(BaseThinkingReasoningParser):
""" """
Reasoning parser for Ernie45 thinking model. Reasoning parser for Ernie45 thinking model.
The Ernie45 thinking model ouput format is The Ernie45 thinking model output format is
abc\n</think>\n\n<response>\ndef\n</response>\n abc\n</think>\n\n<response>\ndef\n</response>\n
or abc\n</think>\ndef or abc\n</think>\ndef
""" """
...@@ -73,7 +73,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser): ...@@ -73,7 +73,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
Extract reasoning content from a delta message. Extract reasoning content from a delta message.
Handles streaming output where previous + delta = current. Handles streaming output where previous + delta = current.
Uses token IDs for faster processing. Uses token IDs for faster processing.
The Ernie45 thinking model ouput format is The Ernie45 thinking model output format is
abc\n</think>\n\n<response>\ndef\n</response>\n abc\n</think>\n\n<response>\ndef\n</response>\n
or abc\n</think>\ndef or abc\n</think>\ndef
- 'abc' goes to reasoning - 'abc' goes to reasoning
...@@ -148,7 +148,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser): ...@@ -148,7 +148,7 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser):
) -> tuple[str | None, str | None]: ) -> tuple[str | None, str | None]:
""" """
Extract reasoning content from the model output. Extract reasoning content from the model output.
The Ernie45 thinking model ouput format is The Ernie45 thinking model output format is
abc\n</think>\n\n\n<response>\ndef\n</response>\n abc\n</think>\n\n\n<response>\ndef\n</response>\n
or abc\n</think>\ndef or abc\n</think>\ndef
- 'abc' goes to reasoning - 'abc' goes to reasoning
......
...@@ -564,7 +564,7 @@ def replace_vision_chunk_video_placeholder( ...@@ -564,7 +564,7 @@ def replace_vision_chunk_video_placeholder(
mm_data: "MultiModalDataDict", mm_data: "MultiModalDataDict",
video_placeholder: str | None, video_placeholder: str | None,
) -> str | list[int]: ) -> str | list[int]:
# get video placehoder, replace it with runtime video-chunk prompts # get video placeholder, replace it with runtime video-chunk prompts
if video_placeholder and isinstance(prompt_raw, str): if video_placeholder and isinstance(prompt_raw, str):
video_prompts = build_video_prompts_from_mm_data(mm_data) video_prompts = build_video_prompts_from_mm_data(mm_data)
......
""" """
Schemas and utilites for preprocessing inputs. Schemas and utilities for preprocessing inputs.
""" """
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
......
""" """
Schemas and utilites for tokenization inputs. Schemas and utilities for tokenization inputs.
""" """
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
......
...@@ -169,7 +169,7 @@ def _prepare_apply_chat_template_tools_and_messages( ...@@ -169,7 +169,7 @@ def _prepare_apply_chat_template_tools_and_messages(
tool.pop(tool_key) tool.pop(tool_key)
logger.warning_once( logger.warning_once(
f"'{tool_key}' is not supported by mistral-common for tools. " f"'{tool_key}' is not supported by mistral-common for tools. "
"It has been poped from the tool definition." "It has been popped from the tool definition."
) )
if tool["type"] == "function": if tool["type"] == "function":
function_keys = list(tool["function"].keys()) function_keys = list(tool["function"].keys())
...@@ -178,7 +178,7 @@ def _prepare_apply_chat_template_tools_and_messages( ...@@ -178,7 +178,7 @@ def _prepare_apply_chat_template_tools_and_messages(
tool["function"].pop(function_key) tool["function"].pop(function_key)
logger.warning_once( logger.warning_once(
f"'{function_key}' is not supported by mistral-common " f"'{function_key}' is not supported by mistral-common "
"for function tools. It has been poped from the " "for function tools. It has been popped from the "
"function definition." "function definition."
) )
else: else:
......
...@@ -402,7 +402,7 @@ class Ovis2_5Processor(ProcessorMixin): ...@@ -402,7 +402,7 @@ class Ovis2_5Processor(ProcessorMixin):
images = [images] images = [images]
elif video is not None: elif video is not None:
is_video = True is_video = True
# type of vidoe in dummy_mm_data is np.ndarray # type of video in dummy_mm_data is np.ndarray
if isinstance(video, np.ndarray): if isinstance(video, np.ndarray):
images = [] images = []
for i in range(video.shape[0]): for i in range(video.shape[0]):
......
...@@ -174,7 +174,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata] ...@@ -174,7 +174,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
query_start_loc = query_start_loc[: num_decodes + 1] query_start_loc = query_start_loc[: num_decodes + 1]
block_table_tensor = block_table_tensor[:num_decodes] block_table_tensor = block_table_tensor[:num_decodes]
sheduler_metadata = ops.cpu_attn_get_scheduler_metadata( scheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
num_reqs=num_reqs, num_reqs=num_reqs,
num_heads=self.num_heads, num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads, num_kv_heads=self.num_kv_heads,
...@@ -197,7 +197,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata] ...@@ -197,7 +197,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
seq_lens=seq_lens, seq_lens=seq_lens,
block_table=block_table_tensor, block_table=block_table_tensor,
slot_mapping=slot_mapping, slot_mapping=slot_mapping,
scheduler_metadata=sheduler_metadata, scheduler_metadata=scheduler_metadata,
causal=causal, causal=causal,
use_sdpa_prefill=self.use_sdpa_prefill, use_sdpa_prefill=self.use_sdpa_prefill,
num_decode_tokens=num_decode_tokens, num_decode_tokens=num_decode_tokens,
......
...@@ -383,7 +383,7 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): ...@@ -383,7 +383,7 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
# Return a tensor of shape (#requests, #max blocks) # Return a tensor of shape (#requests, #max blocks)
state_indices_tensor = common_attn_metadata.block_table_tensor state_indices_tensor = common_attn_metadata.block_table_tensor
# Additional cache-related varaiables: # Additional cache-related variables:
mamba_block_size = self.kv_cache_spec.block_size mamba_block_size = self.kv_cache_spec.block_size
( (
block_idx_last_computed_token, block_idx_last_computed_token,
......
...@@ -49,14 +49,14 @@ if TYPE_CHECKING: ...@@ -49,14 +49,14 @@ if TYPE_CHECKING:
logger = init_logger(__name__) logger = init_logger(__name__)
# For FP8 sparse attention we have two impelementations: # For FP8 sparse attention we have two implementations:
# 1. Mixed batch mode: use the FP8 decode kernel for both prefill and decode; this is # 1. Mixed batch mode: use the FP8 decode kernel for both prefill and decode; this is
# done by treating all tokens as single batch. # done by treating all tokens as single batch.
# 2. Separate prefill and decode mode: use the BF16 prefill kernel for prefill # 2. Separate prefill and decode mode: use the BF16 prefill kernel for prefill
# (upconverting the FP8 cache to BF16 then calling the prefill kernel) and using # (upconverting the FP8 cache to BF16 then calling the prefill kernel) and using
# the FP8 decode kernel for decode. # the FP8 decode kernel for decode.
# Currently we use #1 when the number of heads per rank is low (i.e. TP) since the BF16 # Currently we use #1 when the number of heads per rank is low (i.e. TP) since the BF16
# prefill kernel requires padding the numer of heads to 128 while the decode does not # prefill kernel requires padding the number of heads to 128 while the decode does not
# so when the per-rank head count is below MIN_HEADS_FOR_BF16_PREFILL we use the mixed # so when the per-rank head count is below MIN_HEADS_FOR_BF16_PREFILL we use the mixed
# batch mode (#1). # batch mode (#1).
MIN_HEADS_FOR_BF16_PREFILL = 32 MIN_HEADS_FOR_BF16_PREFILL = 32
...@@ -126,7 +126,7 @@ class FlashMLASparseBackend(AttentionBackend): ...@@ -126,7 +126,7 @@ class FlashMLASparseBackend(AttentionBackend):
cache_dtype_str: str = "auto", cache_dtype_str: str = "auto",
) -> tuple[int, ...]: ) -> tuple[int, ...]:
if cache_dtype_str == "fp8_ds_mla": if cache_dtype_str == "fp8_ds_mla":
# custom storage fromat is 656 bytes # custom storage format is 656 bytes
# see FlashMLA readme.md for details # see FlashMLA readme.md for details
return (num_blocks, block_size, 656) return (num_blocks, block_size, 656)
else: else:
......
...@@ -370,7 +370,7 @@ class AiterFlashAttentionMetadata: ...@@ -370,7 +370,7 @@ class AiterFlashAttentionMetadata:
slot_mapping: torch.Tensor slot_mapping: torch.Tensor
block_table: torch.Tensor block_table: torch.Tensor
# prefill and deocde split # prefill and decode split
num_decodes: int num_decodes: int
num_decode_tokens: int num_decode_tokens: int
num_prefills: int num_prefills: int
...@@ -1099,7 +1099,7 @@ class AiterFlashAttentionImpl(AttentionImpl): ...@@ -1099,7 +1099,7 @@ class AiterFlashAttentionImpl(AttentionImpl):
extend_tokens_slice = slice( extend_tokens_slice = slice(
num_decode_tokens, num_decode_tokens + num_extend_tokens num_decode_tokens, num_decode_tokens + num_extend_tokens
) )
extend_querys = query[extend_tokens_slice] extend_queries = query[extend_tokens_slice]
extend_keys = key[extend_tokens_slice] extend_keys = key[extend_tokens_slice]
extend_values = value[extend_tokens_slice] extend_values = value[extend_tokens_slice]
extend_outputs = output[extend_tokens_slice] extend_outputs = output[extend_tokens_slice]
...@@ -1110,7 +1110,7 @@ class AiterFlashAttentionImpl(AttentionImpl): ...@@ -1110,7 +1110,7 @@ class AiterFlashAttentionImpl(AttentionImpl):
v_scale = attn_metadata.v_scale v_scale = attn_metadata.v_scale
self.extend_forward( self.extend_forward(
attn_metadata=attn_metadata, attn_metadata=attn_metadata,
query=extend_querys, query=extend_queries,
key=extend_keys, key=extend_keys,
value=extend_values, value=extend_values,
key_cache=key_cache, key_cache=key_cache,
......
...@@ -863,7 +863,7 @@ class MambaManager(SingleTypeKVCacheManager): ...@@ -863,7 +863,7 @@ class MambaManager(SingleTypeKVCacheManager):
): ):
# Mamba can't rely on blocks generated by other requests in the current step # Mamba can't rely on blocks generated by other requests in the current step
# To put it in the next step, we return num_gpu_blocks + 1 so # To put it in the next step, we return num_gpu_blocks + 1 so
# that kv_cache_manager will think there is no enough blocks to allocte now # that kv_cache_manager will think there is no enough blocks to allocate now
# and don't schedule it in the current step. # and don't schedule it in the current step.
return self.block_pool.num_gpu_blocks + 1 return self.block_pool.num_gpu_blocks + 1
if self.mamba_cache_mode != "align": if self.mamba_cache_mode != "align":
......
...@@ -1724,11 +1724,11 @@ class DPEngineCoreProc(EngineCoreProc): ...@@ -1724,11 +1724,11 @@ class DPEngineCoreProc(EngineCoreProc):
""" """
Send notifications to EngineCoreClient, which can then forward Send notifications to EngineCoreClient, which can then forward
the notifications to other engine core processes. It is used for: the notifications to other engine core processes. It is used for:
1) In scale up: new core engines to notify exisiting core engines 1) In scale up: new core engines to notify existing core engines
that they are ready; that they are ready;
2) In scale down: removing core engines to notify EngineCoreClient 2) In scale down: removing core engines to notify EngineCoreClient
so EngineCoreClient can release their ray placement groups; so EngineCoreClient can release their ray placement groups;
3) Both scale up/down: to notify EngineCoreClient that exisiting 3) Both scale up/down: to notify EngineCoreClient that existing
core engines have already switched to the new parallel setup. core engines have already switched to the new parallel setup.
""" """
if vllm_config is None: if vllm_config is None:
......
...@@ -194,7 +194,7 @@ class InputProcessor: ...@@ -194,7 +194,7 @@ class InputProcessor:
@staticmethod @staticmethod
def assign_request_id(request: EngineCoreRequest): def assign_request_id(request: EngineCoreRequest):
"""Replace the externally supplied request ID with an internal request ID """Replace the externally supplied request ID with an internal request ID
that adds 8 random characters in order to ensure uniquness. that adds 8 random characters in order to ensure uniqueness.
""" """
if request.external_req_id is not None: if request.external_req_id is not None:
raise ValueError( raise ValueError(
......
...@@ -197,7 +197,7 @@ class SingleDirectionOffloadingHandler(OffloadingHandler): ...@@ -197,7 +197,7 @@ class SingleDirectionOffloadingHandler(OffloadingHandler):
transfer = self._transfers.popleft() transfer = self._transfers.popleft()
transfer_time = ( transfer_time = (
transfer.start_event.elapsed_time(transfer.end_event) * 1e-3 transfer.start_event.elapsed_time(transfer.end_event) * 1e-3
) # elapsed_time is in miliseconds ) # elapsed_time is in milliseconds
result = TransferResult( result = TransferResult(
job_id=transfer.job_id, job_id=transfer.job_id,
success=True, success=True,
......
...@@ -905,7 +905,7 @@ class GPUModelRunner( ...@@ -905,7 +905,7 @@ class GPUModelRunner(
Args: Args:
scheduler_output: The scheduler output. scheduler_output: The scheduler output.
""" """
# Attention free models have zero kv_cache_goups, however models # Attention free models have zero kv_cache_groups, however models
# like Mamba are also attention free but use the kv_cache for # like Mamba are also attention free but use the kv_cache for
# keeping its internal state. This is why we check the number # keeping its internal state. This is why we check the number
# of kv_cache groups instead of solely checking # of kv_cache groups instead of solely checking
...@@ -1065,7 +1065,7 @@ class GPUModelRunner( ...@@ -1065,7 +1065,7 @@ class GPUModelRunner(
# of the request. for example: # of the request. for example:
# first step: num_computed_tokens = 0, spec_tokens = [], # first step: num_computed_tokens = 0, spec_tokens = [],
# prev_num_draft_len = 0. # prev_num_draft_len = 0.
# second step: num_computed_tokens = 100(prompt lenth), # second step: num_computed_tokens = 100(prompt length),
# spec_tokens = [a,b], prev_num_draft_len = 0. # spec_tokens = [a,b], prev_num_draft_len = 0.
# third step: num_computed_tokens = 100 + 2, spec_tokens = [c,d], # third step: num_computed_tokens = 100 + 2, spec_tokens = [c,d],
# prev_num_draft_len = 2. # prev_num_draft_len = 2.
...@@ -1412,30 +1412,30 @@ class GPUModelRunner( ...@@ -1412,30 +1412,30 @@ class GPUModelRunner(
prev_draft_token_indices.extend(range(start, start + draft_len)) prev_draft_token_indices.extend(range(start, start + draft_len))
indices_match &= prev_index == flattened_index indices_match &= prev_index == flattened_index
max_flattened_index = max(max_flattened_index, flattened_index) max_flattened_index = max(max_flattened_index, flattened_index)
num_commmon_tokens = len(sample_flattened_indices) num_common_tokens = len(sample_flattened_indices)
total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens
if num_commmon_tokens < total_without_spec: if num_common_tokens < total_without_spec:
# If not all requests are decodes from the last iteration, # If not all requests are decodes from the last iteration,
# We need to copy the input_ids_cpu to the GPU first. # We need to copy the input_ids_cpu to the GPU first.
self.input_ids.copy_to_gpu(total_num_scheduled_tokens) self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
if self.enable_prompt_embeds: if self.enable_prompt_embeds:
self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens) self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens) self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
if num_commmon_tokens == 0: if num_common_tokens == 0:
# No requests in common with the previous iteration # No requests in common with the previous iteration
# So input_ids.cpu will have all the input ids. # So input_ids.cpu will have all the input ids.
return return
if indices_match and max_flattened_index == (num_commmon_tokens - 1): if indices_match and max_flattened_index == (num_common_tokens - 1):
# Common-case optimization: the batch is unchanged # Common-case optimization: the batch is unchanged
# and no reordering happened. # and no reordering happened.
# The indices are both the same permutation of 0..N-1 so # The indices are both the same permutation of 0..N-1 so
# we can copy directly using a single slice. # we can copy directly using a single slice.
self.input_ids.gpu[:num_commmon_tokens].copy_( self.input_ids.gpu[:num_common_tokens].copy_(
self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0], self.input_batch.prev_sampled_token_ids[:num_common_tokens, 0],
non_blocking=True, non_blocking=True,
) )
if self.enable_prompt_embeds: if self.enable_prompt_embeds:
self.is_token_ids.gpu[:num_commmon_tokens] = True self.is_token_ids.gpu[:num_common_tokens] = True
return return
# Upload the index tensors asynchronously so the scatter can be non-blocking. # Upload the index tensors asynchronously so the scatter can be non-blocking.
sampled_tokens_index_tensor = torch.tensor( sampled_tokens_index_tensor = torch.tensor(
...@@ -4383,7 +4383,7 @@ class GPUModelRunner( ...@@ -4383,7 +4383,7 @@ class GPUModelRunner(
self.model.compile(fullgraph=True, backend=backend) self.model.compile(fullgraph=True, backend=backend)
return return
# for other compilation modes, cudagraph behavior is controlled by # for other compilation modes, cudagraph behavior is controlled by
# CudagraphWraper and CudagraphDispatcher of vllm. # CudagraphWrapper and CudagraphDispatcher of vllm.
# wrap the model with full cudagraph wrapper if needed. # wrap the model with full cudagraph wrapper if needed.
cudagraph_mode = self.compilation_config.cudagraph_mode cudagraph_mode = self.compilation_config.cudagraph_mode
...@@ -4444,7 +4444,7 @@ class GPUModelRunner( ...@@ -4444,7 +4444,7 @@ class GPUModelRunner(
:param weights_path: path to load weights from if weights_iterator is not :param weights_path: path to load weights from if weights_iterator is not
provided. Use path of original model if neither is provided. provided. Use path of original model if neither is provided.
:param is_checkpoint_format: set to False if weights have already been processed :param is_checkpoint_format: set to False if weights have already been processed
into kernel format (repacking, renaming, ect.) into kernel format (repacking, renaming, etc.)
""" """
# TODO(@kylesayrs): generalize to all runners and loaders # TODO(@kylesayrs): generalize to all runners and loaders
# argument validation # argument validation
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment