Unverified commit 6a895197, authored by Jiayi Yan, committed by GitHub
Browse files

[Bugfix][CI] fix typos (#34934)


Signed-off-by: 1195343015 <1195343015@qq.com>
Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8c760b6a
...@@ -3106,7 +3106,7 @@ def cpu_attn_get_scheduler_metadata( ...@@ -3106,7 +3106,7 @@ def cpu_attn_get_scheduler_metadata(
isa: str, isa: str,
enable_kv_split: bool, enable_kv_split: bool,
) -> torch.Tensor: ) -> torch.Tensor:
sheduler_metadata = torch.ops._C.get_scheduler_metadata( scheduler_metadata = torch.ops._C.get_scheduler_metadata(
num_reqs, num_reqs,
num_heads, num_heads,
num_kv_heads, num_kv_heads,
...@@ -3119,7 +3119,7 @@ def cpu_attn_get_scheduler_metadata( ...@@ -3119,7 +3119,7 @@ def cpu_attn_get_scheduler_metadata(
isa, isa,
enable_kv_split, enable_kv_split,
) )
return sheduler_metadata return scheduler_metadata
def cpu_attn_reshape_and_cache( def cpu_attn_reshape_and_cache(
......
...@@ -872,7 +872,7 @@ class CompilationConfig: ...@@ -872,7 +872,7 @@ class CompilationConfig:
) )
# Currently only eager and inductor backend are supported. # Currently only eager and inductor backend are supported.
# for piecewise compilation. Custom backends are not suppported for # for piecewise compilation. Custom backends are not supported for
# piecewise compilation. Update when more backends are supported. # piecewise compilation. Update when more backends are supported.
if self.mode == CompilationMode.VLLM_COMPILE and self.backend not in [ if self.mode == CompilationMode.VLLM_COMPILE and self.backend not in [
"", "",
......
...@@ -59,7 +59,7 @@ class ObservabilityConfig: ...@@ -59,7 +59,7 @@ class ObservabilityConfig:
enable_layerwise_nvtx_tracing: bool = False enable_layerwise_nvtx_tracing: bool = False
"""Enable layerwise NVTX tracing. This traces the execution of each layer or """Enable layerwise NVTX tracing. This traces the execution of each layer or
module in the model and attach informations such as input/output shapes to module in the model and attach information such as input/output shapes to
nvtx range markers. Noted that this doesn't work with CUDA graphs enabled.""" nvtx range markers. Noted that this doesn't work with CUDA graphs enabled."""
enable_mfu_metrics: bool = False enable_mfu_metrics: bool = False
......
...@@ -592,7 +592,7 @@ class VllmConfig: ...@@ -592,7 +592,7 @@ class VllmConfig:
If the user configuration does not specify a value for a default field If the user configuration does not specify a value for a default field
and if the default field is still None after all user selections are and if the default field is still None after all user selections are
applied, then default values will be applied to the field. User speciied applied, then default values will be applied to the field. User specified
fields will not be overridden by the default. fields will not be overridden by the default.
Args: Args:
......
...@@ -44,7 +44,7 @@ class DefaultEplbPolicy(AbstractEplbPolicy): ...@@ -44,7 +44,7 @@ class DefaultEplbPolicy(AbstractEplbPolicy):
rank_in_pack = np.zeros_like(pack_index, dtype=np.int64) rank_in_pack = np.zeros_like(pack_index, dtype=np.int64)
return pack_index, rank_in_pack return pack_index, rank_in_pack
# Sort and get indices in decending order # Sort and get indices in descending order
indices = np.argsort(-weight, axis=-1) indices = np.argsort(-weight, axis=-1)
pack_index = np.full((num_layers, num_groups), -1, dtype=np.int64) pack_index = np.full((num_layers, num_groups), -1, dtype=np.int64)
......
...@@ -129,7 +129,7 @@ class KVConnectorRole(enum.Enum): ...@@ -129,7 +129,7 @@ class KVConnectorRole(enum.Enum):
class KVConnectorHandshakeMetadata(ABC): # noqa: B024 class KVConnectorHandshakeMetadata(ABC): # noqa: B024
""" """
Metadata used for out of band connector handshake between Metadata used for out of band connector handshake between
P/D workers. This needs to serializeable. P/D workers. This needs to serializable.
""" """
pass pass
......
...@@ -398,7 +398,7 @@ class ReqMeta: ...@@ -398,7 +398,7 @@ class ReqMeta:
) )
def need_gpu_interm_buffer(lmcache_config: LMCacheEngineConfig): def need_gpu_interim_buffer(lmcache_config: LMCacheEngineConfig):
return not lmcache_config.enable_pd return not lmcache_config.enable_pd
...@@ -497,7 +497,7 @@ def _init_lmcache_engine( ...@@ -497,7 +497,7 @@ def _init_lmcache_engine(
use_mla, use_mla,
) )
use_gpu = need_gpu_interm_buffer(lmcache_config) use_gpu = need_gpu_interim_buffer(lmcache_config)
vllm_gpu_connector: ( vllm_gpu_connector: (
VLLMBufferLayerwiseGPUConnector VLLMBufferLayerwiseGPUConnector
| VLLMPagedMemGPUConnectorV2 | VLLMPagedMemGPUConnectorV2
......
...@@ -481,7 +481,7 @@ class MooncakeConnectorWorker: ...@@ -481,7 +481,7 @@ class MooncakeConnectorWorker:
) )
self._remote_agents: dict[EngineId, dict[int, dict[int, str]]] = {} self._remote_agents: dict[EngineId, dict[int, dict[int, str]]] = {}
self._pending_bootstrap_querys: dict[str, asyncio.Event] = {} self._pending_bootstrap_queries: dict[str, asyncio.Event] = {}
self.side_channel_port: int = 0 # we will bind it in register_kv_caches() self.side_channel_port: int = 0 # we will bind it in register_kv_caches()
self.engine_id: EngineId = engine_id self.engine_id: EngineId = engine_id
self.tp_rank = get_tensor_model_parallel_rank() self.tp_rank = get_tensor_model_parallel_rank()
...@@ -1077,7 +1077,7 @@ class MooncakeConnectorWorker: ...@@ -1077,7 +1077,7 @@ class MooncakeConnectorWorker:
response = self._xfer_resp_decoder.decode(ret_msg) response = self._xfer_resp_decoder.decode(ret_msg)
if response.status == MooncakeXferResponseStatus.ERROR: if response.status == MooncakeXferResponseStatus.ERROR:
logger.error( logger.error(
"Error happens during tranfering kvcache for %s: %s", "Error happens during transferring kvcache for %s: %s",
req_ids, req_ids,
response.err_msg, response.err_msg,
) )
...@@ -1140,8 +1140,8 @@ class MooncakeConnectorWorker: ...@@ -1140,8 +1140,8 @@ class MooncakeConnectorWorker:
) )
# Always notify others regardless of connection success or failure. # Always notify others regardless of connection success or failure.
self._pending_bootstrap_querys[remote_bootstrap_addr].set() self._pending_bootstrap_queries[remote_bootstrap_addr].set()
del self._pending_bootstrap_querys[remote_bootstrap_addr] del self._pending_bootstrap_queries[remote_bootstrap_addr]
def receive_kv( def receive_kv(
self, self,
...@@ -1171,11 +1171,11 @@ class MooncakeConnectorWorker: ...@@ -1171,11 +1171,11 @@ class MooncakeConnectorWorker:
pull_metas: dict[ReqId, PullReqMeta], pull_metas: dict[ReqId, PullReqMeta],
): ):
remote_bootstrap_addr = next(iter(pull_metas.values())).remote_bootstrap_addr remote_bootstrap_addr = next(iter(pull_metas.values())).remote_bootstrap_addr
if remote_bootstrap_addr not in self._pending_bootstrap_querys: if remote_bootstrap_addr not in self._pending_bootstrap_queries:
self._pending_bootstrap_querys[remote_bootstrap_addr] = asyncio.Event() self._pending_bootstrap_queries[remote_bootstrap_addr] = asyncio.Event()
await self._connect_to_prefiller_bootstrap(remote_bootstrap_addr) await self._connect_to_prefiller_bootstrap(remote_bootstrap_addr)
else: else:
await self._pending_bootstrap_querys[remote_bootstrap_addr].wait() await self._pending_bootstrap_queries[remote_bootstrap_addr].wait()
if remote_engine_id not in self._remote_agents: if remote_engine_id not in self._remote_agents:
logger.error( logger.error(
......
...@@ -720,7 +720,7 @@ class OffloadPromMetrics(KVConnectorPromMetrics): ...@@ -720,7 +720,7 @@ class OffloadPromMetrics(KVConnectorPromMetrics):
per_engine_labelvalues: dict[int, list[object]], per_engine_labelvalues: dict[int, list[object]],
): ):
super().__init__(vllm_config, metric_types, labelnames, per_engine_labelvalues) super().__init__(vllm_config, metric_types, labelnames, per_engine_labelvalues)
# (engine_idx, transfer_tupe) -> (metric with bounded labels) # (engine_idx, transfer_type) -> (metric with bounded labels)
self.histogram_transfer_size: dict[tuple[int, str], PromMetricT] = {} self.histogram_transfer_size: dict[tuple[int, str], PromMetricT] = {}
self.counter_kv_bytes: dict[tuple[int, str], PromMetricT] = {} self.counter_kv_bytes: dict[tuple[int, str], PromMetricT] = {}
self.counter_kv_transfer_time: dict[tuple[int, str], PromMetricT] = {} self.counter_kv_transfer_time: dict[tuple[int, str], PromMetricT] = {}
......
...@@ -1647,9 +1647,9 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -1647,9 +1647,9 @@ class OpenAIServingResponses(OpenAIServing):
# TODO: in streaming, we noticed this bug: # TODO: in streaming, we noticed this bug:
# https://github.com/vllm-project/vllm/issues/25697 # https://github.com/vllm-project/vllm/issues/25697
await self._initialize_tool_sessions(request, context, exit_stack) await self._initialize_tool_sessions(request, context, exit_stack)
processer = self._process_harmony_streaming_events processor = self._process_harmony_streaming_events
else: else:
processer = self._process_simple_streaming_events processor = self._process_simple_streaming_events
# TODO Hanchen make sampling params to include the structural tag # TODO Hanchen make sampling params to include the structural tag
initial_response = ResponsesResponse.from_request( initial_response = ResponsesResponse.from_request(
...@@ -1677,7 +1677,7 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -1677,7 +1677,7 @@ class OpenAIServingResponses(OpenAIServing):
) )
try: try:
async for event_data in processer( async for event_data in processor(
request, request,
sampling_params, sampling_params,
result_generator, result_generator,
......
...@@ -1520,7 +1520,7 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1520,7 +1520,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
os.getenv("VLLM_DEEPEP_BUFFER_SIZE_MB", "1024") os.getenv("VLLM_DEEPEP_BUFFER_SIZE_MB", "1024")
), ),
# Force DeepEP to use intranode kernel for inter-node communication in # Force DeepEP to use intranode kernel for inter-node communication in
# high throughput mode. This is useful archive higher prefill throuhgput # high throughput mode. This is useful archive higher prefill throughput
# on system supports multi-node nvlink (e.g GB200). # on system supports multi-node nvlink (e.g GB200).
"VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE": lambda: bool( "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE": lambda: bool(
int(os.getenv("VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", "0")) int(os.getenv("VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", "0"))
......
...@@ -175,7 +175,7 @@ class DPMetadata: ...@@ -175,7 +175,7 @@ class DPMetadata:
# Get the cumulative tokens across sequence parallel ranks. # Get the cumulative tokens across sequence parallel ranks.
# In this case the input to the MoEs will be distributed w.r.t both # In this case the input to the MoEs will be distributed w.r.t both
# DP and TP rank. # DP and TP rank.
# When sp_size==1, this is just the cummulative num tokens across DP. # When sp_size==1, this is just the cumulative num tokens across DP.
def cu_tokens_across_sp(self, sp_size: int) -> torch.Tensor: def cu_tokens_across_sp(self, sp_size: int) -> torch.Tensor:
num_tokens_across_sp_cpu = ( num_tokens_across_sp_cpu = (
self.num_tokens_across_dp_cpu - 1 + sp_size self.num_tokens_across_dp_cpu - 1 + sp_size
......
...@@ -57,10 +57,10 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): ...@@ -57,10 +57,10 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
input_parallel = input_ input_parallel = input_
else: else:
# TODO: simplify code below # TODO: simplify code below
splitted_input = split_tensor_along_last_dim( split_input = split_tensor_along_last_dim(
input_, num_partitions=self.tp_size input_, num_partitions=self.tp_size
) )
input_parallel = splitted_input[self.tp_rank].contiguous() input_parallel = split_input[self.tp_rank].contiguous()
# Matrix multiply. # Matrix multiply.
bias_ = ( bias_ = (
......
...@@ -11,7 +11,7 @@ from vllm.lora.lora_weights import LoRALayerWeights ...@@ -11,7 +11,7 @@ from vllm.lora.lora_weights import LoRALayerWeights
from vllm.lora.peft_helper import PEFTHelper from vllm.lora.peft_helper import PEFTHelper
from vllm.lora.utils import ( from vllm.lora.utils import (
get_lora_id, get_lora_id,
is_base_embeddding_weights, is_base_embedding_weights,
parse_fine_tuned_lora_name, parse_fine_tuned_lora_name,
) )
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
...@@ -86,7 +86,7 @@ class LoRAModel: ...@@ -86,7 +86,7 @@ class LoRAModel:
pin_memory = str(device) == "cpu" and is_pin_memory_available() pin_memory = str(device) == "cpu" and is_pin_memory_available()
loras: dict[str, LoRALayerWeights] = {} loras: dict[str, LoRALayerWeights] = {}
for tensor_name, tensor in tensors.items(): for tensor_name, tensor in tensors.items():
if is_base_embeddding_weights(tensor_name): if is_base_embedding_weights(tensor_name):
continue continue
# Skip modules based on model-defined prefixes (e.g., MTP layers) # Skip modules based on model-defined prefixes (e.g., MTP layers)
if skip_prefixes and cls._should_skip_module(tensor_name, skip_prefixes): if skip_prefixes and cls._should_skip_module(tensor_name, skip_prefixes):
...@@ -162,7 +162,7 @@ class LoRAModel: ...@@ -162,7 +162,7 @@ class LoRAModel:
def check_unexpected_modules(modules: dict): def check_unexpected_modules(modules: dict):
for lora_module in modules.keys(): # noqa for lora_module in modules.keys(): # noqa
if is_base_embeddding_weights(lora_module): if is_base_embedding_weights(lora_module):
continue continue
# Handle PEFT file format where experts.base_layer is the # Handle PEFT file format where experts.base_layer is the
# gate_up_proj and experts is the down_proj # gate_up_proj and experts is the down_proj
......
...@@ -193,7 +193,7 @@ def parse_fine_tuned_lora_name( ...@@ -193,7 +193,7 @@ def parse_fine_tuned_lora_name(
raise ValueError(f"{name} is unsupported LoRA weight") raise ValueError(f"{name} is unsupported LoRA weight")
def is_base_embeddding_weights(name: str) -> bool: def is_base_embedding_weights(name: str) -> bool:
# hardcoded subfixes for input & output embedding weights # hardcoded subfixes for input & output embedding weights
embedding_suffixes = ( embedding_suffixes = (
".embed_tokens.base_layer.weight", ".embed_tokens.base_layer.weight",
......
...@@ -82,7 +82,7 @@ class CPUWNA16LinearKernel(MPLinearKernel): ...@@ -82,7 +82,7 @@ class CPUWNA16LinearKernel(MPLinearKernel):
weight = weight.permute(0, 2, 1).reshape(input_size, output_size).contiguous() weight = weight.permute(0, 2, 1).reshape(input_size, output_size).contiguous()
weight = pack_quantized_values_into_int32(weight, self.config.weight_type, 1) weight = pack_quantized_values_into_int32(weight, self.config.weight_type, 1)
# make 16 output channel as a block and transpose to the make # make 16 output channel as a block and transpose to the make
# the block contigous # the block contiguous
weight = ( weight = (
weight.view(input_size, -1, 16 // pack_factor) weight.view(input_size, -1, 16 // pack_factor)
.permute(1, 0, 2) .permute(1, 0, 2)
......
...@@ -2540,7 +2540,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): ...@@ -2540,7 +2540,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
) )
# workspace # workspace
# |------- N tokens --------|--------- N*dcp_size tokens ----------| # |------- N tokens --------|--------- N*dcp_size tokens ----------|
# |<- use for loca_gather ->|<--------- use for allgather -------->| # |<- use for local_gather ->|<--------- use for allgather -------->|
allgather_offset = workspace.shape[0] // (dcp_world_size + 1) allgather_offset = workspace.shape[0] // (dcp_world_size + 1)
assert allgather_offset * (dcp_world_size + 1) == workspace.shape[0] assert allgather_offset * (dcp_world_size + 1) == workspace.shape[0]
assert toks <= allgather_offset assert toks <= allgather_offset
......
...@@ -394,5 +394,5 @@ class FlashInferExperts(mk.FusedMoEExpertsModular): ...@@ -394,5 +394,5 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None: def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
# No support for LoRA in flashinfer_cutlass_fused_moe. # No support for LoRA in flashinfer_cutlass_fused_moe.
# See TODOs in flashinfer functions runMoe and runMoeMinLantency. # See TODOs in flashinfer functions runMoe and runMoeMinLatency.
raise NotImplementedError("LoRA is not supported for flashinfer_cutlass_moe") raise NotImplementedError("LoRA is not supported for flashinfer_cutlass_moe")
...@@ -409,7 +409,7 @@ def batched_fused_marlin_moe( ...@@ -409,7 +409,7 @@ def batched_fused_marlin_moe(
Note that the moe_align_block_size function indicates, Note that the moe_align_block_size function indicates,
- What rows of the A matrix (hidden_states) to access during the - What rows of the A matrix (hidden_states) to access during the
matmul, via sorted_ids output. matmul, via sorted_ids output.
- What expert_id to use for each block matmul, via expert_ids ouptut. - What expert_id to use for each block matmul, via expert_ids output.
In the batched version, the tokens are already grouped/batched by experts In the batched version, the tokens are already grouped/batched by experts
they subscribe to. Due to this, we can represent the batched hidden_states they subscribe to. Due to this, we can represent the batched hidden_states
......
...@@ -606,7 +606,7 @@ class FusedMoEExperts(ABC): ...@@ -606,7 +606,7 @@ class FusedMoEExperts(ABC):
""" """
Whether the kernel supports deployment in particular parallel config. Whether the kernel supports deployment in particular parallel config.
Can be overriden if a kernel does not support EP, SP or some other Can be overridden if a kernel does not support EP, SP or some other
configuration. configuration.
""" """
raise NotImplementedError raise NotImplementedError
...@@ -620,7 +620,7 @@ class FusedMoEExperts(ABC): ...@@ -620,7 +620,7 @@ class FusedMoEExperts(ABC):
""" """
Whether the kernel supports a routing method (e.g. GroupedTopK). Whether the kernel supports a routing method (e.g. GroupedTopK).
Can be overriden by monolithic kernels that execute the router Can be overridden by monolithic kernels that execute the router
in addition to the experts if certain routers are not supported. in addition to the experts if certain routers are not supported.
""" """
return True return True
...@@ -633,7 +633,7 @@ class FusedMoEExperts(ABC): ...@@ -633,7 +633,7 @@ class FusedMoEExperts(ABC):
""" """
Whether a kernel supports a particular dtype for router logits input. Whether a kernel supports a particular dtype for router logits input.
Can be overriden by monolithic kernels that execute the router Can be overridden by monolithic kernels that execute the router
in addition to the experts if certain dtypes are not supported. in addition to the experts if certain dtypes are not supported.
""" """
return True return True
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment